Created
October 7, 2025 07:05
-
-
Save Chubek/aa40085d585d474bc26440cf93696f60 to your computer and use it in GitHub Desktop.
x86-64 book
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| <!DOCTYPE html> | |
| <html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en"> | |
| <head> | |
| <meta charset="utf-8" /> | |
| <meta name="generator" content="pandoc" /> | |
| <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes" /> | |
| <!-- KaTeX CSS --> | |
| <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.css" integrity="sha384-n8MVd4RsNIU0tAv4ct0nTaAbDJwPJzDEaqSD1odI+WdtXRGWt2kTvGFasHpSy3SV" crossorigin="anonymous"> | |
| <!-- KaTeX JavaScript --> | |
| <title>Dossier - x86asm-dossier</title> | |
| <style> | |
| :root { | |
| /* Color palette: Dracula accent colors over a darker custom base (the first five values are not stock Dracula) */ | |
| --bg-color: #1a1b26; | |
| --current-line: #24283b; | |
| --selection: #364a82; | |
| --foreground: #c0caf5; | |
| --comment: #565f89; | |
| --cyan: #8be9fd; | |
| --green: #50fa7b; | |
| --orange: #ffb86c; | |
| --pink: #ff79c6; | |
| --purple: #bd93f9; | |
| --red: #ff5555; | |
| --yellow: #f1fa8c; | |
| /* Semantic colors */ | |
| --text-color: var(--foreground); | |
| --heading-color: var(--purple); | |
| --link-color: var(--cyan); | |
| --link-hover: var(--pink); | |
| --code-bg: var(--current-line); | |
| --code-border: #6272a4; | |
| --toc-bg: #21222c; | |
| --toc-border: var(--current-line); | |
| --blockquote-border: var(--purple); | |
| --table-border: var(--current-line); | |
| } | |
| * { | |
| box-sizing: border-box; | |
| } | |
| body { | |
| font-family: Georgia, 'Times New Roman', serif; | |
| line-height: 1.7; | |
| max-width: 1400px; | |
| margin: 0 auto; | |
| padding: 20px; | |
| background-color: var(--bg-color); | |
| color: var(--text-color); | |
| display: flex; | |
| gap: 2rem; | |
| } | |
| /* Table of Contents */ | |
| #TOC { | |
| position: sticky; | |
| top: 20px; | |
| min-width: 250px; | |
| max-width: 300px; | |
| height: fit-content; | |
| max-height: calc(100vh - 40px); | |
| overflow-y: auto; | |
| background-color: var(--toc-bg); | |
| border: 1px solid var(--toc-border); | |
| border-radius: 8px; | |
| padding: 1.5rem; | |
| font-size: 0.9rem; | |
| } | |
| #TOC ul { | |
| list-style: none; | |
| padding-left: 0; | |
| margin: 0; | |
| } | |
| #TOC > ul > li { | |
| margin-bottom: 0.5rem; | |
| } | |
| #TOC ul ul { | |
| padding-left: 1rem; | |
| margin-top: 0.3rem; | |
| } | |
| #TOC a { | |
| color: var(--link-color); | |
| text-decoration: none; | |
| display: block; | |
| padding: 0.2rem 0; | |
| transition: all 0.2s; | |
| } | |
| #TOC a:hover { | |
| color: var(--link-hover); | |
| transform: translateX(4px); | |
| } | |
| /* Main content */ | |
| main { | |
| flex: 1; | |
| min-width: 0; | |
| max-width: 800px; | |
| } | |
| h1, h2, h3, h4, h5, h6 { | |
| color: var(--heading-color); | |
| margin-top: 2rem; | |
| margin-bottom: 1rem; | |
| line-height: 1.3; | |
| font-weight: 600; | |
| } | |
| h1 { | |
| font-size: 2.5rem; | |
| border-bottom: 2px solid var(--purple); | |
| padding-bottom: 0.5rem; | |
| color: var(--pink); | |
| } | |
| h2 { | |
| font-size: 2rem; | |
| border-bottom: 1px solid var(--current-line); | |
| padding-bottom: 0.3rem; | |
| } | |
| h3 { font-size: 1.5rem; color: var(--cyan); } | |
| h4 { font-size: 1.25rem; color: var(--green); } | |
| h5 { font-size: 1.1rem; color: var(--orange); } | |
| h6 { font-size: 1rem; color: var(--yellow); } | |
| a { | |
| color: var(--link-color); | |
| text-decoration: none; | |
| transition: color 0.2s; | |
| } | |
| a:hover { | |
| color: var(--link-hover); | |
| text-decoration: underline; | |
| } | |
| /* Code blocks */ | |
| pre { | |
| background-color: var(--code-bg); | |
| border: 1px solid var(--code-border); | |
| border-radius: 6px; | |
| padding: 1rem; | |
| overflow-x: auto; | |
| font-size: 0.9rem; | |
| } | |
| code { | |
| font-family: 'Consolas', 'Monaco', 'Courier New', monospace; | |
| background-color: var(--code-bg); | |
| padding: 0.2rem 0.4rem; | |
| border-radius: 3px; | |
| font-size: 0.9em; | |
| color: var(--pink); | |
| } | |
| pre code { | |
| background-color: transparent; | |
| padding: 0; | |
| color: var(--foreground); | |
| } | |
| /* Blockquotes */ | |
| blockquote { | |
| margin: 1.5rem 0; | |
| padding: 1rem; | |
| padding-left: 1.5rem; | |
| border-left: 4px solid var(--blockquote-border); | |
| background-color: rgba(189, 147, 249, 0.1); | |
| color: var(--foreground); | |
| font-style: italic; | |
| border-radius: 0 6px 6px 0; | |
| } | |
| blockquote p { | |
| margin: 0.5rem 0; | |
| } | |
| /* Tables */ | |
| table { | |
| border-collapse: collapse; | |
| width: 100%; | |
| margin: 1.5rem 0; | |
| } | |
| table th, | |
| table td { | |
| border: 1px solid var(--table-border); | |
| padding: 0.75rem; | |
| text-align: left; | |
| } | |
| table th { | |
| background-color: var(--code-bg); | |
| color: var(--purple); | |
| font-weight: bold; | |
| } | |
| table tr:nth-child(even) { | |
| background-color: rgba(68, 71, 90, 0.3); | |
| } | |
| table tr:hover { | |
| background-color: rgba(68, 71, 90, 0.5); | |
| } | |
| /* Images */ | |
| img { | |
| max-width: 100%; | |
| height: auto; | |
| border-radius: 6px; | |
| border: 1px solid var(--current-line); | |
| } | |
| /* Lists */ | |
| ul, ol { | |
| margin: 1rem 0; | |
| padding-left: 2rem; | |
| } | |
| li { | |
| margin: 0.5rem 0; | |
| } | |
| li::marker { | |
| color: var(--purple); | |
| } | |
| /* Horizontal rule */ | |
| hr { | |
| border: none; | |
| border-top: 2px solid var(--current-line); | |
| margin: 2rem 0; | |
| } | |
| /* Inline emphasis */ | |
| strong { | |
| color: var(--orange); | |
| font-weight: bold; | |
| } | |
| em { | |
| color: var(--yellow); | |
| font-style: italic; | |
| } | |
| /* Selection */ | |
| ::selection { | |
| background-color: var(--selection); | |
| color: var(--foreground); | |
| } | |
| /* Responsive design */ | |
| @media (max-width: 900px) { | |
| body { | |
| flex-direction: column; | |
| } | |
| #TOC { | |
| position: static; | |
| max-width: 100%; | |
| max-height: 300px; | |
| margin-bottom: 2rem; | |
| } | |
| main { | |
| max-width: 100%; | |
| } | |
| } | |
| /* Scrollbar styling */ | |
| ::-webkit-scrollbar { | |
| width: 12px; | |
| } | |
| ::-webkit-scrollbar-track { | |
| background: var(--bg-color); | |
| } | |
| ::-webkit-scrollbar-thumb { | |
| background: var(--current-line); | |
| border-radius: 6px; | |
| } | |
| ::-webkit-scrollbar-thumb:hover { | |
| background: var(--comment); | |
| } | |
| /* Title block */ | |
| #title-block-header { | |
| margin-bottom: 3rem; | |
| padding-bottom: 1rem; | |
| border-bottom: 2px solid var(--purple); | |
| } | |
| #title-block-header .title { | |
| margin-top: 0; | |
| color: var(--pink); | |
| } | |
| #title-block-header .subtitle { | |
| color: var(--purple); | |
| font-size: 1.3rem; | |
| margin: 0.5rem 0; | |
| } | |
| #title-block-header .author { | |
| color: var(--cyan); | |
| margin: 0.3rem 0; | |
| } | |
| #title-block-header .date { | |
| color: var(--comment); | |
| font-size: 0.9rem; | |
| margin: 0.3rem 0; | |
| } | |
| </style> | |
| <style> | |
| html { -webkit-text-size-adjust: 100%; } | |
| pre > code.sourceCode { white-space: pre; position: relative; } | |
| pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } | |
| pre > code.sourceCode > span:empty { height: 1.2em; } | |
| .sourceCode { overflow: visible; } | |
| code.sourceCode > span { color: inherit; text-decoration: inherit; } | |
| div.sourceCode { margin: 1em 0; } | |
| pre.sourceCode { margin: 0; } | |
| @media screen { | |
| div.sourceCode { overflow: auto; } | |
| } | |
| @media print { | |
| pre > code.sourceCode { white-space: pre-wrap; } | |
| pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } | |
| } | |
| pre.numberSource code | |
| { counter-reset: source-line 0; } | |
| pre.numberSource code > span | |
| { position: relative; left: -4em; counter-increment: source-line; } | |
| pre.numberSource code > span > a:first-child::before | |
| { content: counter(source-line); | |
| position: relative; left: -1em; text-align: right; vertical-align: baseline; | |
| border: none; display: inline-block; | |
| -webkit-touch-callout: none; -webkit-user-select: none; | |
| -khtml-user-select: none; -moz-user-select: none; | |
| -ms-user-select: none; user-select: none; | |
| padding: 0 4px; width: 4em; | |
| color: #aaaaaa; | |
| } | |
| pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } | |
| /* Wrapper stays transparent: a light backdrop would show around the dark, rounded <pre>. */ | |
| div.sourceCode | |
| { background-color: transparent; } | |
| @media screen { | |
| pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } | |
| } | |
| /* Token colors reference the :root palette variables so they stay legible on the dark --code-bg. */ | |
| /* Mapping keeps each token's hue family: red -> --red, brown -> --orange, blue -> --cyan/--purple, green -> --green. */ | |
| code span.al { color: var(--red); } /* Alert */ | |
| code span.an { color: var(--orange); font-weight: bold; font-style: italic; } /* Annotation */ | |
| code span.at { color: var(--cyan); } /* Attribute */ | |
| code span.bn { color: var(--purple); } /* BaseN */ | |
| code span.cf { color: var(--cyan); font-weight: bold; } /* ControlFlow */ | |
| code span.ch { color: var(--green); } /* Char */ | |
| code span.cn { color: var(--orange); } /* Constant */ | |
| code span.co { color: var(--orange); font-style: italic; } /* Comment */ | |
| code span.cv { color: var(--orange); font-weight: bold; font-style: italic; } /* CommentVar */ | |
| code span.do { color: var(--orange); font-weight: bold; font-style: italic; } /* Documentation */ | |
| code span.dt { color: var(--cyan); } /* DataType */ | |
| code span.dv { color: var(--purple); } /* DecVal */ | |
| code span.er { color: var(--red); font-weight: bold; } /* Error */ | |
| code span.ex { } /* Extension */ | |
| code span.fl { color: var(--purple); } /* Float */ | |
| code span.fu { color: var(--cyan); font-weight: bold; } /* Function */ | |
| code span.im { } /* Import */ | |
| code span.in { color: var(--orange); font-weight: bold; font-style: italic; } /* Information */ | |
| code span.kw { color: var(--cyan); font-weight: bold; } /* Keyword */ | |
| code span.op { color: var(--pink); font-weight: bold; } /* Operator */ | |
| code span.ot { color: var(--orange); } /* Other */ | |
| code span.pp { color: var(--orange); font-style: italic; } /* Preprocessor */ | |
| code span.sc { color: var(--pink); font-weight: bold; } /* SpecialChar */ | |
| code span.ss { color: var(--green); } /* SpecialString */ | |
| code span.st { color: var(--green); } /* String */ | |
| code span.va { color: var(--foreground); } /* Variable (was black, unreadable on the dark background) */ | |
| code span.vs { color: var(--green); } /* VerbatimString */ | |
| code span.wa { color: var(--orange); font-weight: bold; font-style: italic; } /* Warning */ | |
| </style> | |
| </head> | |
| <body> | |
| <nav id="TOC" role="doc-toc"> | |
| <ul> | |
| <li><a href="#chapter-1-introduction-to-x86-64-architecture" | |
| id="toc-chapter-1-introduction-to-x86-64-architecture"><strong>Chapter | |
| 1: Introduction to x86-64 Architecture</strong></a> | |
| <ul> | |
| <li><a href="#evolution-from-8086-to-x86-64" | |
| id="toc-evolution-from-8086-to-x86-64"><strong>1.1 Evolution from 8086 | |
| to x86-64</strong></a> | |
| <ul> | |
| <li><a href="#the-journey-from-16-bit-to-64-bit" | |
| id="toc-the-journey-from-16-bit-to-64-bit"><strong>The Journey from | |
| 16-bit to 64-bit</strong></a></li> | |
| <li><a href="#the-32-bit-revolution-80386-and-ia-32" | |
| id="toc-the-32-bit-revolution-80386-and-ia-32"><strong>The 32-bit | |
| Revolution: 80386 and IA-32</strong></a></li> | |
| <li><a href="#the-64-bit-extension-amd64-and-intel-64" | |
| id="toc-the-64-bit-extension-amd64-and-intel-64"><strong>The 64-bit | |
| Extension: AMD64 and Intel 64</strong></a></li> | |
| <li><a href="#compiler-perspective-evolutionary-complexity" | |
| id="toc-compiler-perspective-evolutionary-complexity"><strong>Compiler | |
| Perspective: Evolutionary Complexity</strong></a></li> | |
| </ul></li> | |
| <li><a href="#x86-64-execution-environment-and-modes" | |
| id="toc-x86-64-execution-environment-and-modes"><strong>1.2 x86-64 | |
| Execution Environment and Modes</strong></a> | |
| <ul> | |
| <li><a href="#operating-modes" | |
| id="toc-operating-modes"><strong>Operating Modes</strong></a></li> | |
| <li><a href="#execution-state" | |
| id="toc-execution-state"><strong>Execution State</strong></a></li> | |
| <li><a href="#privilege-levels-and-protection" | |
| id="toc-privilege-levels-and-protection"><strong>Privilege Levels and | |
| Protection</strong></a></li> | |
| </ul></li> | |
| <li><a | |
| href="#register-architecture-general-purpose-segment-and-system-registers" | |
| id="toc-register-architecture-general-purpose-segment-and-system-registers"><strong>1.3 | |
| Register Architecture: General Purpose, Segment, and System | |
| Registers</strong></a> | |
| <ul> | |
| <li><a href="#general-purpose-registers" | |
| id="toc-general-purpose-registers"><strong>General-Purpose | |
| Registers</strong></a></li> | |
| <li><a href="#special-purpose-registers" | |
| id="toc-special-purpose-registers"><strong>Special-Purpose | |
| Registers</strong></a></li> | |
| <li><a href="#segment-registers-in-64-bit-mode" | |
| id="toc-segment-registers-in-64-bit-mode"><strong>Segment Registers in | |
| 64-bit Mode</strong></a></li> | |
| <li><a href="#control-registers" | |
| id="toc-control-registers"><strong>Control Registers</strong></a></li> | |
| <li><a href="#model-specific-registers-msrs" | |
| id="toc-model-specific-registers-msrs"><strong>Model-Specific Registers | |
| (MSRs)</strong></a></li> | |
| <li><a href="#compiler-register-usage-conventions" | |
| id="toc-compiler-register-usage-conventions"><strong>Compiler Register | |
| Usage Conventions</strong></a></li> | |
| </ul></li> | |
| <li><a href="#memory-models-and-addressing" | |
| id="toc-memory-models-and-addressing"><strong>1.4 Memory Models and | |
| Addressing</strong></a> | |
| <ul> | |
| <li><a href="#virtual-address-space" | |
| id="toc-virtual-address-space"><strong>Virtual Address | |
| Space</strong></a></li> | |
| <li><a href="#memory-segmentation-in-64-bit-mode" | |
| id="toc-memory-segmentation-in-64-bit-mode"><strong>Memory Segmentation | |
| in 64-bit Mode</strong></a></li> | |
| <li><a href="#addressing-modes" | |
| id="toc-addressing-modes"><strong>Addressing Modes</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a | |
| href="#chapter-2-x86-64-instruction-set-architecture-fundamentals" | |
| id="toc-chapter-2-x86-64-instruction-set-architecture-fundamentals"><strong>Chapter | |
| 2: x86-64 Instruction Set Architecture Fundamentals</strong></a> | |
| <ul> | |
| <li><a href="#instruction-format-and-prefixes-rex-vex-evex" | |
| id="toc-instruction-format-and-prefixes-rex-vex-evex"><strong>2.1 | |
| Instruction Format and Prefixes (REX, VEX, EVEX)</strong></a> | |
| <ul> | |
| <li><a href="#basic-instruction-format" | |
| id="toc-basic-instruction-format"><strong>Basic Instruction | |
| Format</strong></a></li> | |
| <li><a href="#legacy-prefixes" id="toc-legacy-prefixes"><strong>Legacy | |
| Prefixes</strong></a></li> | |
| <li><a href="#rex-prefix" id="toc-rex-prefix"><strong>REX | |
| Prefix</strong></a></li> | |
| <li><a href="#vex-prefix-avx" id="toc-vex-prefix-avx"><strong>VEX Prefix | |
| (AVX)</strong></a></li> | |
| <li><a href="#evex-prefix-avx-512" | |
| id="toc-evex-prefix-avx-512"><strong>EVEX Prefix | |
| (AVX-512)</strong></a></li> | |
| <li><a href="#compiler-encoding-decisions" | |
| id="toc-compiler-encoding-decisions"><strong>Compiler Encoding | |
| Decisions</strong></a></li> | |
| </ul></li> | |
| <li><a href="#data-movement-instructions" | |
| id="toc-data-movement-instructions"><strong>2.2 Data Movement | |
| Instructions</strong></a> | |
| <ul> | |
| <li><a href="#basic-move-instructions" | |
| id="toc-basic-move-instructions"><strong>Basic Move | |
| Instructions</strong></a></li> | |
| <li><a href="#zero-and-sign-extension" | |
| id="toc-zero-and-sign-extension"><strong>Zero and Sign | |
| Extension</strong></a></li> | |
| <li><a href="#conditional-moves" | |
| id="toc-conditional-moves"><strong>Conditional Moves</strong></a></li> | |
| <li><a href="#special-data-movement" | |
| id="toc-special-data-movement"><strong>Special Data | |
| Movement</strong></a></li> | |
| <li><a href="#compiler-optimization-patterns" | |
| id="toc-compiler-optimization-patterns"><strong>Compiler Optimization | |
| Patterns</strong></a></li> | |
| </ul></li> | |
| <li><a href="#arithmetic-and-logic-operations" | |
| id="toc-arithmetic-and-logic-operations"><strong>2.3 Arithmetic and | |
| Logic Operations</strong></a> | |
| <ul> | |
| <li><a href="#integer-arithmetic" | |
| id="toc-integer-arithmetic"><strong>Integer Arithmetic</strong></a></li> | |
| <li><a href="#logical-operations" | |
| id="toc-logical-operations"><strong>Logical Operations</strong></a></li> | |
| <li><a href="#flag-manipulation" id="toc-flag-manipulation"><strong>Flag | |
| Manipulation</strong></a></li> | |
| </ul></li> | |
| <li><a href="#bit-manipulation-and-shifts" | |
| id="toc-bit-manipulation-and-shifts"><strong>2.4 Bit Manipulation and | |
| Shifts</strong></a> | |
| <ul> | |
| <li><a href="#shift-operations" id="toc-shift-operations"><strong>Shift | |
| Operations</strong></a></li> | |
| <li><a href="#bit-scanning-and-manipulation" | |
| id="toc-bit-scanning-and-manipulation"><strong>Bit Scanning and | |
| Manipulation</strong></a></li> | |
| <li><a href="#compiler-bit-manipulation-patterns" | |
| id="toc-compiler-bit-manipulation-patterns"><strong>Compiler Bit | |
| Manipulation Patterns</strong></a></li> | |
| </ul></li> | |
| <li><a href="#control-flow-branches-loops-and-calls" | |
| id="toc-control-flow-branches-loops-and-calls"><strong>2.5 Control Flow: | |
| Branches, Loops, and Calls</strong></a> | |
| <ul> | |
| <li><a href="#unconditional-jumps" | |
| id="toc-unconditional-jumps"><strong>Unconditional | |
| Jumps</strong></a></li> | |
| <li><a href="#conditional-branches" | |
| id="toc-conditional-branches"><strong>Conditional | |
| Branches</strong></a></li> | |
| <li><a href="#loop-instructions" id="toc-loop-instructions"><strong>Loop | |
| Instructions</strong></a></li> | |
| <li><a href="#compiler-control-flow-patterns" | |
| id="toc-compiler-control-flow-patterns"><strong>Compiler Control Flow | |
| Patterns</strong></a></li> | |
| <li><a href="#branch-prediction-considerations" | |
| id="toc-branch-prediction-considerations"><strong>Branch Prediction | |
| Considerations</strong></a></li> | |
| </ul></li> | |
| <li><a href="#string-operations" id="toc-string-operations"><strong>2.6 | |
| String Operations</strong></a> | |
| <ul> | |
| <li><a href="#basic-string-instructions" | |
| id="toc-basic-string-instructions"><strong>Basic String | |
| Instructions</strong></a></li> | |
| <li><a href="#rep-prefixes" id="toc-rep-prefixes"><strong>REP | |
| Prefixes</strong></a></li> | |
| <li><a href="#optimized-string-operations" | |
| id="toc-optimized-string-operations"><strong>Optimized String | |
| Operations</strong></a></li> | |
| <li><a href="#compiler-string-intrinsics" | |
| id="toc-compiler-string-intrinsics"><strong>Compiler String | |
| Intrinsics</strong></a></li> | |
| </ul></li> | |
| <li><a href="#compiler-perspective-instruction-selection-patterns" | |
| id="toc-compiler-perspective-instruction-selection-patterns"><strong>2.7 | |
| Compiler Perspective: Instruction Selection Patterns</strong></a> | |
| <ul> | |
| <li><a href="#instruction-selection-overview" | |
| id="toc-instruction-selection-overview"><strong>Instruction Selection | |
| Overview</strong></a></li> | |
| <li><a href="#common-optimization-patterns" | |
| id="toc-common-optimization-patterns"><strong>Common Optimization | |
| Patterns</strong></a></li> | |
| <li><a href="#peephole-optimizations" | |
| id="toc-peephole-optimizations"><strong>Peephole | |
| Optimizations</strong></a></li> | |
| <li><a href="#code-generation-examples" | |
| id="toc-code-generation-examples"><strong>Code Generation | |
| Examples</strong></a></li> | |
| <li><a href="#compiler-instruction-costs" | |
| id="toc-compiler-instruction-costs"><strong>Compiler Instruction | |
| Costs</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-3-memory-architecture-and-addressing-modes" | |
| id="toc-chapter-3-memory-architecture-and-addressing-modes"><strong>Chapter | |
| 3: Memory Architecture and Addressing Modes</strong></a> | |
| <ul> | |
| <li><a href="#x86-64-memory-organization" | |
| id="toc-x86-64-memory-organization"><strong>3.1 x86-64 Memory | |
| Organization</strong></a> | |
| <ul> | |
| <li><a href="#virtual-address-space-layout" | |
| id="toc-virtual-address-space-layout"><strong>Virtual Address Space | |
| Layout</strong></a></li> | |
| <li><a href="#memory-segmentation-in-64-bit-mode-1" | |
| id="toc-memory-segmentation-in-64-bit-mode-1"><strong>Memory | |
| Segmentation in 64-bit Mode</strong></a></li> | |
| <li><a href="#page-table-structure" | |
| id="toc-page-table-structure"><strong>Page Table | |
| Structure</strong></a></li> | |
| <li><a href="#memory-types-and-caching" | |
| id="toc-memory-types-and-caching"><strong>Memory Types and | |
| Caching</strong></a></li> | |
| </ul></li> | |
| <li><a href="#complex-addressing-modes" | |
| id="toc-complex-addressing-modes"><strong>3.2 Complex Addressing | |
| Modes</strong></a> | |
| <ul> | |
| <li><a href="#general-addressing-mode-format" | |
| id="toc-general-addressing-mode-format"><strong>General Addressing Mode | |
| Format</strong></a></li> | |
| <li><a href="#addressing-mode-examples" | |
| id="toc-addressing-mode-examples"><strong>Addressing Mode | |
| Examples</strong></a></li> | |
| <li><a href="#rip-relative-addressing" | |
| id="toc-rip-relative-addressing"><strong>RIP-Relative | |
| Addressing</strong></a></li> | |
| <li><a href="#addressing-mode-encoding" | |
| id="toc-addressing-mode-encoding"><strong>Addressing Mode | |
| Encoding</strong></a></li> | |
| </ul></li> | |
| <li><a href="#memory-access-patterns-and-optimization" | |
| id="toc-memory-access-patterns-and-optimization"><strong>3.3 Memory | |
| Access Patterns and Optimization</strong></a> | |
| <ul> | |
| <li><a href="#cache-friendly-access-patterns" | |
| id="toc-cache-friendly-access-patterns"><strong>Cache-Friendly Access | |
| Patterns</strong></a></li> | |
| <li><a href="#prefetching" | |
| id="toc-prefetching"><strong>Prefetching</strong></a></li> | |
| <li><a href="#non-temporal-memory-access" | |
| id="toc-non-temporal-memory-access"><strong>Non-Temporal Memory | |
| Access</strong></a></li> | |
| </ul></li> | |
| <li><a href="#stack-operations-and-management" | |
| id="toc-stack-operations-and-management"><strong>3.4 Stack Operations | |
| and Management</strong></a> | |
| <ul> | |
| <li><a href="#stack-frame-layout" | |
| id="toc-stack-frame-layout"><strong>Stack Frame Layout</strong></a></li> | |
| <li><a href="#stack-alignment" id="toc-stack-alignment"><strong>Stack | |
| Alignment</strong></a></li> | |
| <li><a href="#red-zone" id="toc-red-zone"><strong>Red | |
| Zone</strong></a></li> | |
| </ul></li> | |
| <li><a href="#memory-barriers-and-atomics" | |
| id="toc-memory-barriers-and-atomics"><strong>3.5 Memory Barriers and | |
| Atomics</strong></a> | |
| <ul> | |
| <li><a href="#memory-ordering" id="toc-memory-ordering"><strong>Memory | |
| Ordering</strong></a></li> | |
| <li><a href="#atomic-operations" | |
| id="toc-atomic-operations"><strong>Atomic Operations</strong></a></li> | |
| <li><a href="#transactional-memory-tsx" | |
| id="toc-transactional-memory-tsx"><strong>Transactional Memory | |
| (TSX)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#effective-address-calculation-lea" | |
| id="toc-effective-address-calculation-lea"><strong>3.6 Effective Address | |
| Calculation (LEA)</strong></a> | |
| <ul> | |
| <li><a href="#lea-instruction-capabilities" | |
| id="toc-lea-instruction-capabilities"><strong>LEA Instruction | |
| Capabilities</strong></a></li> | |
| <li><a href="#compiler-lea-patterns" | |
| id="toc-compiler-lea-patterns"><strong>Compiler LEA | |
| Patterns</strong></a></li> | |
| <li><a href="#lea-vs-other-instructions" | |
| id="toc-lea-vs-other-instructions"><strong>LEA vs Other | |
| Instructions</strong></a></li> | |
| </ul></li> | |
| <li><a href="#compiler-memory-optimization-strategies" | |
| id="toc-compiler-memory-optimization-strategies"><strong>3.7 Compiler | |
| Memory Optimization Strategies</strong></a> | |
| <ul> | |
| <li><a href="#structure-layout-and-padding" | |
| id="toc-structure-layout-and-padding"><strong>Structure Layout and | |
| Padding</strong></a></li> | |
| <li><a href="#loop-optimization-and-memory-access" | |
| id="toc-loop-optimization-and-memory-access"><strong>Loop Optimization | |
| and Memory Access</strong></a></li> | |
| <li><a href="#alias-analysis-and-optimization" | |
| id="toc-alias-analysis-and-optimization"><strong>Alias Analysis and | |
| Optimization</strong></a></li> | |
| <li><a href="#memory-access-coalescing" | |
| id="toc-memory-access-coalescing"><strong>Memory Access | |
| Coalescing</strong></a></li> | |
| <li><a href="#summary-and-key-takeaways" | |
| id="toc-summary-and-key-takeaways"><strong>Summary and Key | |
| Takeaways</strong></a></li> | |
| <li><a href="#looking-ahead" id="toc-looking-ahead"><strong>Looking | |
| Ahead</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-4-stack-operations-and-calling-conventions" | |
| id="toc-chapter-4-stack-operations-and-calling-conventions"><strong>Chapter | |
| 4: Stack Operations and Calling Conventions</strong></a> | |
| <ul> | |
| <li><a href="#stack-architecture-fundamentals" | |
| id="toc-stack-architecture-fundamentals"><strong>4.1 Stack Architecture | |
| Fundamentals</strong></a> | |
| <ul> | |
| <li><a href="#stack-layout-and-growth-direction" | |
| id="toc-stack-layout-and-growth-direction"><strong>Stack Layout and | |
| Growth Direction</strong></a></li> | |
| <li><a href="#stack-pointer-alignment-requirements" | |
| id="toc-stack-pointer-alignment-requirements"><strong>Stack Pointer | |
| Alignment Requirements</strong></a></li> | |
| <li><a href="#stack-frame-structure" | |
| id="toc-stack-frame-structure"><strong>Stack Frame | |
| Structure</strong></a></li> | |
| </ul></li> | |
| <li><a href="#system-v-amd64-abi" | |
| id="toc-system-v-amd64-abi"><strong>4.2 System V AMD64 ABI</strong></a> | |
| <ul> | |
| <li><a href="#register-usage-convention" | |
| id="toc-register-usage-convention"><strong>Register Usage | |
| Convention</strong></a></li> | |
| <li><a href="#function-calling-examples" | |
| id="toc-function-calling-examples"><strong>Function Calling | |
| Examples</strong></a></li> | |
| <li><a href="#floating-point-and-mixed-arguments" | |
| id="toc-floating-point-and-mixed-arguments"><strong>Floating-Point and | |
| Mixed Arguments</strong></a></li> | |
| <li><a href="#red-zone-usage" id="toc-red-zone-usage"><strong>Red Zone | |
| Usage</strong></a></li> | |
| <li><a href="#variable-argument-functions" | |
| id="toc-variable-argument-functions"><strong>Variable Argument | |
| Functions</strong></a></li> | |
| </ul></li> | |
| <li><a href="#microsoft-x64-abi" id="toc-microsoft-x64-abi"><strong>4.3 | |
| Microsoft x64 ABI</strong></a> | |
| <ul> | |
| <li><a href="#register-convention-differences" | |
| id="toc-register-convention-differences"><strong>Register Convention | |
| Differences</strong></a></li> | |
| <li><a href="#function-prologue-and-epilogue-windows" | |
| id="toc-function-prologue-and-epilogue-windows"><strong>Function | |
| Prologue and Epilogue (Windows)</strong></a></li> | |
| <li><a href="#floating-point-parameter-passing-windows" | |
| id="toc-floating-point-parameter-passing-windows"><strong>Floating-Point | |
| Parameter Passing (Windows)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#stack-frame-management" | |
| id="toc-stack-frame-management"><strong>4.4 Stack Frame | |
| Management</strong></a> | |
| <ul> | |
| <li><a href="#frame-pointer-vs-frame-pointer-omission" | |
| id="toc-frame-pointer-vs-frame-pointer-omission"><strong>Frame Pointer | |
| vs Frame Pointer Omission</strong></a></li> | |
| <li><a href="#dynamic-stack-allocation-alloca" | |
| id="toc-dynamic-stack-allocation-alloca"><strong>Dynamic Stack | |
| Allocation (alloca)</strong></a></li> | |
| <li><a href="#stack-unwinding-support" | |
| id="toc-stack-unwinding-support"><strong>Stack Unwinding | |
| Support</strong></a></li> | |
| </ul></li> | |
| <li><a href="#leaf-vs-non-leaf-functions" | |
| id="toc-leaf-vs-non-leaf-functions"><strong>4.5 Leaf vs Non-Leaf | |
| Functions</strong></a> | |
| <ul> | |
| <li><a href="#leaf-function-optimization" | |
| id="toc-leaf-function-optimization"><strong>Leaf Function | |
| Optimization</strong></a></li> | |
| <li><a href="#tail-call-optimization" | |
| id="toc-tail-call-optimization"><strong>Tail Call | |
| Optimization</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-5-exception-handling-and-stack-unwinding" | |
| id="toc-chapter-5-exception-handling-and-stack-unwinding"><strong>Chapter | |
| 5: Exception Handling and Stack Unwinding</strong></a> | |
| <ul> | |
| <li><a href="#exception-handling-fundamentals" | |
| id="toc-exception-handling-fundamentals"><strong>5.1 Exception Handling | |
| Fundamentals</strong></a> | |
| <ul> | |
| <li><a href="#types-of-exceptions-in-x86-64" | |
| id="toc-types-of-exceptions-in-x86-64"><strong>Types of Exceptions in | |
| x86-64</strong></a></li> | |
| <li><a href="#exception-frame-layout" | |
| id="toc-exception-frame-layout"><strong>Exception Frame | |
| Layout</strong></a></li> | |
| </ul></li> | |
| <li><a href="#stack-unwinding-mechanisms" | |
| id="toc-stack-unwinding-mechanisms"><strong>5.2 Stack Unwinding | |
| Mechanisms</strong></a> | |
| <ul> | |
| <li><a href="#dwarf-cfi-call-frame-information" | |
| id="toc-dwarf-cfi-call-frame-information"><strong>DWARF CFI (Call Frame | |
| Information)</strong></a></li> | |
| <li><a href="#manual-stack-walking" | |
| id="toc-manual-stack-walking"><strong>Manual Stack | |
| Walking</strong></a></li> | |
| </ul></li> | |
| <li><a href="#seh-structured-exception-handling-on-windows" | |
| id="toc-seh-structured-exception-handling-on-windows"><strong>5.3 SEH | |
| (Structured Exception Handling) on Windows</strong></a> | |
| <ul> | |
| <li><a href="#seh-frame-setup" id="toc-seh-frame-setup"><strong>SEH | |
| Frame Setup</strong></a></li> | |
| <li><a href="#unwind-information-structure" | |
| id="toc-unwind-information-structure"><strong>Unwind Information | |
| Structure</strong></a></li> | |
| </ul></li> | |
| <li><a href="#c-exception-handling-implementation" | |
| id="toc-c-exception-handling-implementation"><strong>5.4 C++ Exception | |
| Handling Implementation</strong></a> | |
| <ul> | |
| <li><a href="#itanium-abi-exception-model-gccclang" | |
| id="toc-itanium-abi-exception-model-gccclang"><strong>Itanium ABI | |
| Exception Model (GCC/Clang)</strong></a></li> | |
| <li><a href="#raii-and-destructor-calls-during-unwinding" | |
| id="toc-raii-and-destructor-calls-during-unwinding"><strong>RAII and | |
| Destructor Calls During Unwinding</strong></a></li> | |
| </ul></li> | |
| <li><a href="#signal-handling-and-asynchronous-exceptions" | |
| id="toc-signal-handling-and-asynchronous-exceptions"><strong>5.5 Signal | |
| Handling and Asynchronous Exceptions</strong></a> | |
| <ul> | |
| <li><a href="#posix-signal-frame" | |
| id="toc-posix-signal-frame"><strong>POSIX Signal Frame</strong></a></li> | |
| </ul></li> | |
| <li><a href="#stack-unwinding-fundamentals" | |
| id="toc-stack-unwinding-fundamentals"><strong>5.2 Stack Unwinding | |
| Fundamentals</strong></a> | |
| <ul> | |
| <li><a href="#frame-pointer-chaining" | |
| id="toc-frame-pointer-chaining">Frame Pointer Chaining</a></li> | |
| </ul></li> | |
| <li><a href="#dwarf-cfi-system-v-amd64" | |
| id="toc-dwarf-cfi-system-v-amd64"><strong>5.3 DWARF CFI (System V | |
| AMD64)</strong></a></li> | |
| <li><a href="#windows-x64-seh-and-unwind-info" | |
| id="toc-windows-x64-seh-and-unwind-info"><strong>5.4 Windows x64 SEH and | |
| Unwind Info</strong></a></li> | |
| <li><a href="#language-level-exception-flow-itanium-c-abi" | |
| id="toc-language-level-exception-flow-itanium-c-abi"><strong>5.5 | |
| Language-Level Exception Flow (Itanium C++ ABI)</strong></a></li> | |
| <li><a href="#signals-posix-asynchronous-exceptions" | |
| id="toc-signals-posix-asynchronous-exceptions"><strong>5.6 Signals | |
| (POSIX Asynchronous Exceptions)</strong></a></li> | |
| <li><a href="#practical-stack-unwinding-example" | |
| id="toc-practical-stack-unwinding-example"><strong>5.7 Practical Stack | |
| Unwinding Example</strong></a></li> | |
| <li><a href="#key-points" id="toc-key-points"><strong>Key | |
| Points:</strong></a></li> | |
| </ul></li> | |
| <li><a href="#chapter-6-x87-fpu-and-legacy-floating-point" | |
| id="toc-chapter-6-x87-fpu-and-legacy-floating-point"><strong>Chapter 6: | |
| x87 FPU and Legacy Floating Point</strong></a> | |
| <ul> | |
| <li><a href="#x87-fpu-architecture-overview" | |
| id="toc-x87-fpu-architecture-overview"><strong>6.1 x87 FPU Architecture | |
| Overview</strong></a> | |
| <ul> | |
| <li><a href="#x87-register-stack-model" | |
| id="toc-x87-register-stack-model"><strong>x87 Register Stack | |
| Model</strong></a></li> | |
| <li><a href="#x87-control-and-status-words" | |
| id="toc-x87-control-and-status-words"><strong>x87 Control and Status | |
| Words</strong></a></li> | |
| </ul></li> | |
| <li><a href="#x87-instruction-categories" | |
| id="toc-x87-instruction-categories"><strong>6.2 x87 Instruction | |
| Categories</strong></a> | |
| <ul> | |
| <li><a href="#data-transfer-instructions" | |
| id="toc-data-transfer-instructions"><strong>Data Transfer | |
| Instructions</strong></a></li> | |
| <li><a href="#arithmetic-operations" | |
| id="toc-arithmetic-operations"><strong>Arithmetic | |
| Operations</strong></a></li> | |
| <li><a href="#transcendental-functions" | |
| id="toc-transcendental-functions"><strong>Transcendental | |
| Functions</strong></a></li> | |
| </ul></li> | |
| <li><a href="#comparison-and-conditional-operations" | |
| id="toc-comparison-and-conditional-operations"><strong>6.3 Comparison | |
| and Conditional Operations</strong></a> | |
| <ul> | |
| <li><a href="#comparison-instructions" | |
| id="toc-comparison-instructions"><strong>Comparison | |
| Instructions</strong></a></li> | |
| <li><a href="#conditional-move-fcmovcc" | |
| id="toc-conditional-move-fcmovcc"><strong>Conditional Move | |
| (FCMOVcc)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#exception-handling" | |
| id="toc-exception-handling"><strong>6.4 Exception Handling</strong></a> | |
| <ul> | |
| <li><a href="#x87-exception-types" | |
| id="toc-x87-exception-types"><strong>x87 Exception | |
| Types</strong></a></li> | |
| <li><a href="#exception-service" | |
| id="toc-exception-service"><strong>Exception Service</strong></a></li> | |
| <li><a href="#flag-testing-in-software" | |
| id="toc-flag-testing-in-software"><strong>Flag Testing in | |
| Software</strong></a></li> | |
| <li><a href="#cooperating-with-os-exception-models" | |
| id="toc-cooperating-with-os-exception-models"><strong>Cooperating with | |
| OS Exception Models</strong></a></li> | |
| <li><a href="#modern-context-why-care-in-x8664" | |
| id="toc-modern-context-why-care-in-x8664"><strong>6.5 Modern Context: | |
| Why Care in x86‑64</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-7-sse-and-sse2-programming" | |
| id="toc-chapter-7-sse-and-sse2-programming"><strong>Chapter 7: SSE and | |
| SSE2 Programming</strong></a> | |
| <ul> | |
| <li><a href="#ssesse2-architecture-overview" | |
| id="toc-ssesse2-architecture-overview"><strong>7.1 SSE/SSE2 Architecture | |
| Overview</strong></a> | |
| <ul> | |
| <li><a href="#introduction-to-streaming-simd-extensions" | |
| id="toc-introduction-to-streaming-simd-extensions"><strong>Introduction | |
| to Streaming SIMD Extensions</strong></a></li> | |
| <li><a href="#mxcsr-controlstatus-register" | |
| id="toc-mxcsr-controlstatus-register"><strong>MXCSR Control/Status | |
| Register</strong></a></li> | |
| </ul></li> | |
| <li><a href="#sse-floating-point-operations" | |
| id="toc-sse-floating-point-operations"><strong>7.2 SSE Floating-Point | |
| Operations</strong></a> | |
| <ul> | |
| <li><a href="#single-precision-scalar-operations" | |
| id="toc-single-precision-scalar-operations"><strong>Single-Precision | |
| Scalar Operations</strong></a></li> | |
| <li><a href="#single-precision-packed-operations" | |
| id="toc-single-precision-packed-operations"><strong>Single-Precision | |
| Packed Operations</strong></a></li> | |
| <li><a href="#shuffle-and-permute-operations" | |
| id="toc-shuffle-and-permute-operations"><strong>Shuffle and Permute | |
| Operations</strong></a></li> | |
| </ul></li> | |
| <li><a href="#sse2-double-precision-operations" | |
| id="toc-sse2-double-precision-operations"><strong>7.3 SSE2 | |
| Double-Precision Operations</strong></a> | |
| <ul> | |
| <li><a href="#double-precision-scalar-and-packed" | |
| id="toc-double-precision-scalar-and-packed"><strong>Double-Precision | |
| Scalar and Packed</strong></a></li> | |
| </ul></li> | |
| <li><a href="#sse2-integer-operations" | |
| id="toc-sse2-integer-operations"><strong>7.4 SSE2 Integer | |
| Operations</strong></a> | |
| <ul> | |
| <li><a href="#integer-data-movement" | |
| id="toc-integer-data-movement"><strong>Integer Data | |
| Movement</strong></a></li> | |
| <li><a href="#integer-arithmetic-1" | |
| id="toc-integer-arithmetic-1"><strong>Integer | |
| Arithmetic</strong></a></li> | |
| <li><a href="#logical-and-bitwise-operations" | |
| id="toc-logical-and-bitwise-operations"><strong>Logical and Bitwise | |
| Operations</strong></a></li> | |
| <li><a href="#packing-and-unpacking-integers" | |
| id="toc-packing-and-unpacking-integers"><strong>Packing and Unpacking | |
| Integers</strong></a></li> | |
| <li><a href="#conversion-between-integer-and-floating-point" | |
| id="toc-conversion-between-integer-and-floating-point"><strong>Conversion | |
| Between Integer and Floating Point</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-8-advanced-sse-extensions-sse3-ssse3-sse4" | |
| id="toc-chapter-8-advanced-sse-extensions-sse3-ssse3-sse4"><strong>Chapter | |
| 8: Advanced SSE Extensions (SSE3, SSSE3, SSE4)</strong></a> | |
| <ul> | |
| <li><a href="#sse3-extensions" id="toc-sse3-extensions"><strong>8.1 SSE3 | |
| Extensions</strong></a> | |
| <ul> | |
| <li><a href="#horizontal-arithmetic-operations" | |
| id="toc-horizontal-arithmetic-operations"><strong>Horizontal Arithmetic | |
| Operations</strong></a></li> | |
| <li><a href="#special-move-operations" | |
| id="toc-special-move-operations"><strong>Special Move | |
| Operations</strong></a></li> | |
| <li><a href="#x87-fpu-integration-instructions" | |
| id="toc-x87-fpu-integration-instructions"><strong>x87 FPU Integration | |
| Instructions</strong></a></li> | |
| </ul></li> | |
| <li><a href="#ssse3-extensions" id="toc-ssse3-extensions"><strong>8.2 | |
| SSSE3 Extensions</strong></a> | |
| <ul> | |
| <li><a href="#absolute-value-and-sign-operations" | |
| id="toc-absolute-value-and-sign-operations"><strong>Absolute Value and | |
| Sign Operations</strong></a></li> | |
| <li><a href="#horizontal-addition-with-saturation" | |
| id="toc-horizontal-addition-with-saturation"><strong>Horizontal Addition | |
| with Saturation</strong></a></li> | |
| <li><a href="#multiply-and-add-packed" | |
| id="toc-multiply-and-add-packed"><strong>Multiply and Add | |
| Packed</strong></a></li> | |
| <li><a href="#byte-shuffle-pshufb" | |
| id="toc-byte-shuffle-pshufb"><strong>Byte Shuffle | |
| (PSHUFB)</strong></a></li> | |
| <li><a href="#alignment-operations" | |
| id="toc-alignment-operations"><strong>Alignment | |
| Operations</strong></a></li> | |
| </ul></li> | |
| <li><a href="#sse4.1-extensions" id="toc-sse4.1-extensions"><strong>8.3 | |
| SSE4.1 Extensions</strong></a> | |
| <ul> | |
| <li><a href="#blending-operations" | |
| id="toc-blending-operations"><strong>Blending | |
| Operations</strong></a></li> | |
| <li><a href="#dot-product-instructions" | |
| id="toc-dot-product-instructions"><strong>Dot Product | |
| Instructions</strong></a></li> | |
| <li><a href="#rounding-operations" | |
| id="toc-rounding-operations"><strong>Rounding | |
| Operations</strong></a></li> | |
| <li><a href="#integer-minmax-operations" | |
| id="toc-integer-minmax-operations"><strong>Integer Min/Max | |
| Operations</strong></a></li> | |
| <li><a href="#enhanced-integer-operations" | |
| id="toc-enhanced-integer-operations"><strong>Enhanced Integer | |
| Operations</strong></a></li> | |
| </ul></li> | |
| <li><a href="#compiler-mapping-and-usecases" | |
| id="toc-compiler-mapping-and-usecases"><strong>8.5 Compiler Mapping and | |
| Use‑Cases</strong></a></li> | |
| </ul></li> | |
| <li><a href="#chapter-9-avx-and-avx2-vector-extensions" | |
| id="toc-chapter-9-avx-and-avx2-vector-extensions"><strong>Chapter 9: AVX | |
| and AVX2 Vector Extensions</strong></a> | |
| <ul> | |
| <li><a href="#introduction-to-avx-architecture" | |
| id="toc-introduction-to-avx-architecture"><strong>9.1 Introduction to | |
| AVX Architecture</strong></a> | |
| <ul> | |
| <li><a href="#evolution-from-sse-to-avx" | |
| id="toc-evolution-from-sse-to-avx"><strong>Evolution from SSE to | |
| AVX</strong></a></li> | |
| <li><a href="#ymm-register-architecture" | |
| id="toc-ymm-register-architecture"><strong>YMM Register | |
| Architecture</strong></a></li> | |
| <li><a href="#vex-encoding-prefix" | |
| id="toc-vex-encoding-prefix"><strong>VEX Encoding | |
| Prefix</strong></a></li> | |
| <li><a href="#state-management" id="toc-state-management"><strong>State | |
| Management</strong></a></li> | |
| </ul></li> | |
| <li><a href="#avx-floating-point-operations" | |
| id="toc-avx-floating-point-operations"><strong>9.2 AVX Floating-Point | |
| Operations</strong></a> | |
| <ul> | |
| <li><a href="#bit-packed-operations" | |
| id="toc-bit-packed-operations"><strong>256-bit Packed | |
| Operations</strong></a></li> | |
| <li><a href="#comparison-and-masking" | |
| id="toc-comparison-and-masking"><strong>Comparison and | |
| Masking</strong></a></li> | |
| <li><a href="#broadcast-operations" | |
| id="toc-broadcast-operations"><strong>Broadcast | |
| Operations</strong></a></li> | |
| </ul></li> | |
| <li><a href="#avx-permutation-and-shuffle" | |
| id="toc-avx-permutation-and-shuffle"><strong>9.3 AVX Permutation and | |
| Shuffle</strong></a> | |
| <ul> | |
| <li><a href="#cross-lane-permutation" | |
| id="toc-cross-lane-permutation"><strong>Cross-Lane | |
| Permutation</strong></a></li> | |
| <li><a href="#unpack-and-shuffle" | |
| id="toc-unpack-and-shuffle"><strong>Unpack and Shuffle</strong></a></li> | |
| </ul></li> | |
| <li><a href="#avx2-integer-operations" | |
| id="toc-avx2-integer-operations"><strong>9.4 AVX2 Integer | |
| Operations</strong></a> | |
| <ul> | |
| <li><a href="#bit-integer-arithmetic" | |
| id="toc-bit-integer-arithmetic"><strong>256-bit Integer | |
| Arithmetic</strong></a></li> | |
| <li><a href="#gather-operations" | |
| id="toc-gather-operations"><strong>Gather Operations</strong></a></li> | |
| <li><a href="#variable-shifts" id="toc-variable-shifts"><strong>Variable | |
| Shifts</strong></a></li> | |
| <li><a href="#cross-lane-permutation-1" | |
| id="toc-cross-lane-permutation-1"><strong>Cross-Lane | |
| Permutation</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-10-avx-512-and-future-extensions" | |
| id="toc-chapter-10-avx-512-and-future-extensions"><strong>Chapter 10: | |
| AVX-512 and Future Extensions</strong></a> | |
| <ul> | |
| <li><a href="#avx-512-architecture-overview" | |
| id="toc-avx-512-architecture-overview"><strong>10.1 AVX-512 Architecture | |
| Overview</strong></a> | |
| <ul> | |
| <li><a href="#introduction-to-avx-512" | |
| id="toc-introduction-to-avx-512"><strong>Introduction to | |
| AVX-512</strong></a></li> | |
| <li><a href="#evex-encoding-structure" | |
| id="toc-evex-encoding-structure"><strong>EVEX Encoding | |
| Structure</strong></a></li> | |
| <li><a href="#opmask-registers" id="toc-opmask-registers"><strong>Opmask | |
| Registers</strong></a></li> | |
| </ul></li> | |
| <li><a href="#avx-512-foundation-instructions" | |
| id="toc-avx-512-foundation-instructions"><strong>10.2 AVX-512 Foundation | |
| Instructions</strong></a> | |
| <ul> | |
| <li><a href="#bit-arithmetic-operations" | |
| id="toc-bit-arithmetic-operations"><strong>512-bit Arithmetic | |
| Operations</strong></a></li> | |
| <li><a href="#advanced-permutation" | |
| id="toc-advanced-permutation"><strong>Advanced | |
| Permutation</strong></a></li> | |
| <li><a href="#scatter-operations" | |
| id="toc-scatter-operations"><strong>Scatter Operations</strong></a></li> | |
| </ul></li> | |
| <li><a href="#avx-512-extension-sets" | |
| id="toc-avx-512-extension-sets"><strong>10.3 AVX-512 Extension | |
| Sets</strong></a> | |
| <ul> | |
| <li><a href="#avx-512bw-byte-and-word" | |
| id="toc-avx-512bw-byte-and-word"><strong>AVX-512BW (Byte and | |
| Word)</strong></a></li> | |
| <li><a href="#avx-512dq-doubleword-and-quadword" | |
| id="toc-avx-512dq-doubleword-and-quadword"><strong>AVX-512DQ (Doubleword | |
| and Quadword)</strong></a></li> | |
| <li><a href="#avx-512vnni-vector-neural-network-instructions" | |
| id="toc-avx-512vnni-vector-neural-network-instructions"><strong>AVX-512VNNI | |
| (Vector Neural Network Instructions)</strong></a></li> | |
| <li><a href="#avx-512ifma-integer-fused-multiply-add" | |
| id="toc-avx-512ifma-integer-fused-multiply-add"><strong>AVX-512IFMA | |
| (Integer Fused Multiply-Add)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#avx-512-optimization-patterns" | |
| id="toc-avx-512-optimization-patterns"><strong>10.4 AVX-512 Optimization | |
| Patterns</strong></a> | |
| <ul> | |
| <li><a href="#conditional-execution-with-masks" | |
| id="toc-conditional-execution-with-masks"><strong>Conditional Execution | |
| with Masks</strong></a></li> | |
| <li><a href="#vectorizing-loops" | |
| id="toc-vectorizing-loops"><strong>Vectorizing Loops</strong></a></li> | |
| <li><a href="#reduction-strategies" | |
| id="toc-reduction-strategies"><strong>Reduction | |
| Strategies</strong></a></li> | |
| <li><a href="#scatter-gather-performance" | |
| id="toc-scatter-gather-performance"><strong>Scatter &amp; Gather | |
| Performance</strong></a></li> | |
| <li><a href="#evex-broadcast-for-loop-invariants" | |
| id="toc-evex-broadcast-for-loop-invariants"><strong>EVEX Broadcast for | |
| Loop Invariants</strong></a></li> | |
| </ul></li> | |
| <li><a href="#practical-considerations-future-trends" | |
| id="toc-practical-considerations-future-trends"><strong>10.5 Practical | |
| Considerations &amp; Future Trends</strong></a></li> | |
| </ul></li> | |
| <li><a href="#chapter-11-system-level-architecture-and-protection" | |
| id="toc-chapter-11-system-level-architecture-and-protection"><strong>Chapter | |
| 11: System-Level Architecture and Protection</strong></a> | |
| <ul> | |
| <li><a href="#privilege-levels-and-protection-rings" | |
| id="toc-privilege-levels-and-protection-rings"><strong>11.1 Privilege | |
| Levels and Protection Rings</strong></a> | |
| <ul> | |
| <li><a href="#x86-64-protection-model" | |
| id="toc-x86-64-protection-model"><strong>x86-64 Protection | |
| Model</strong></a></li> | |
| <li><a href="#segment-descriptors-and-gates" | |
| id="toc-segment-descriptors-and-gates"><strong>Segment Descriptors and | |
| Gates</strong></a></li> | |
| <li><a href="#global-and-local-descriptor-tables" | |
| id="toc-global-and-local-descriptor-tables"><strong>Global and Local | |
| Descriptor Tables</strong></a></li> | |
| </ul></li> | |
| <li><a href="#control-registers-and-system-structures" | |
| id="toc-control-registers-and-system-structures"><strong>11.2 Control | |
| Registers and System Structures</strong></a> | |
| <ul> | |
| <li><a href="#control-register-programming" | |
| id="toc-control-register-programming"><strong>Control Register | |
| Programming</strong></a></li> | |
| <li><a href="#model-specific-registers-msrs-1" | |
| id="toc-model-specific-registers-msrs-1"><strong>Model-Specific | |
| Registers (MSRs)</strong></a></li> | |
| <li><a href="#task-state-segment-tss" | |
| id="toc-task-state-segment-tss"><strong>Task State Segment | |
| (TSS)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#interrupt-and-exception-handling" | |
| id="toc-interrupt-and-exception-handling"><strong>11.3 Interrupt and | |
| Exception Handling</strong></a> | |
| <ul> | |
| <li><a href="#interrupt-descriptor-table-management" | |
| id="toc-interrupt-descriptor-table-management"><strong>Interrupt | |
| Descriptor Table Management</strong></a></li> | |
| <li><a href="#system-call-mechanisms" | |
| id="toc-system-call-mechanisms"><strong>System Call | |
| Mechanisms</strong></a></li> | |
| </ul></li> | |
| <li><a href="#memory-protection-mechanisms" | |
| id="toc-memory-protection-mechanisms"><strong>11.4 Memory Protection | |
| Mechanisms</strong></a> | |
| <ul> | |
| <li><a href="#page-table-protection-attributes" | |
| id="toc-page-table-protection-attributes"><strong>Page Table Protection | |
| Attributes</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-12-virtual-memory-and-paging-mechanisms" | |
| id="toc-chapter-12-virtual-memory-and-paging-mechanisms"><strong>Chapter | |
| 12: Virtual Memory and Paging Mechanisms</strong></a> | |
| <ul> | |
| <li><a href="#x86-64-paging-architecture" | |
| id="toc-x86-64-paging-architecture"><strong>12.1 x86-64 Paging | |
| Architecture</strong></a> | |
| <ul> | |
| <li><a href="#four-level-page-tables-pml4" | |
| id="toc-four-level-page-tables-pml4"><strong>Four-Level Page Tables | |
| (PML4)</strong></a></li> | |
| <li><a href="#large-pages-2mb-and-1gb" | |
| id="toc-large-pages-2mb-and-1gb"><strong>Large Pages (2MB and | |
| 1GB)</strong></a></li> | |
| <li><a href="#five-level-paging-la57" | |
| id="toc-five-level-paging-la57"><strong>Five-Level Paging | |
| (LA57)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#translation-lookaside-buffer-tlb-management" | |
| id="toc-translation-lookaside-buffer-tlb-management"><strong>12.2 | |
| Translation Lookaside Buffer (TLB) Management</strong></a> | |
| <ul> | |
| <li><a href="#tlb-invalidation-techniques" | |
| id="toc-tlb-invalidation-techniques"><strong>TLB Invalidation | |
| Techniques</strong></a></li> | |
| <li><a href="#page-attribute-table-pat" | |
| id="toc-page-attribute-table-pat"><strong>Page Attribute Table | |
| (PAT)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#memory-protection-extensions" | |
| id="toc-memory-protection-extensions"><strong>12.3 Memory Protection | |
| Extensions</strong></a> | |
| <ul> | |
| <li><a href="#nx-bit-and-dep" id="toc-nx-bit-and-dep"><strong>NX Bit and | |
| DEP</strong></a></li> | |
| <li><a href="#memory-type-range-registers-mtrrs" | |
| id="toc-memory-type-range-registers-mtrrs"><strong>Memory Type Range | |
| Registers (MTRRs)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#virtual-memory-operations" | |
| id="toc-virtual-memory-operations"><strong>12.4 Virtual Memory | |
| Operations</strong></a> | |
| <ul> | |
| <li><a href="#page-fault-handling" | |
| id="toc-page-fault-handling"><strong>Page Fault | |
| Handling</strong></a></li> | |
| <li><a href="#memory-mapping-and-unmapping" | |
| id="toc-memory-mapping-and-unmapping"><strong>Memory Mapping and | |
| Unmapping</strong></a></li> | |
| <li><a href="#copy-on-write-implementation" | |
| id="toc-copy-on-write-implementation"><strong>Copy-on-Write | |
| Implementation</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-13-interrupts-apic-and-multi-core-programming" | |
| id="toc-chapter-13-interrupts-apic-and-multi-core-programming"><strong>Chapter | |
| 13: Interrupts, APIC, and Multi-Core Programming</strong></a> | |
| <ul> | |
| <li><a href="#interrupt-architecture" | |
| id="toc-interrupt-architecture"><strong>13.1 Interrupt | |
| Architecture</strong></a> | |
| <ul> | |
| <li><a href="#interrupt-descriptor-table-idt" | |
| id="toc-interrupt-descriptor-table-idt"><strong>Interrupt Descriptor | |
| Table (IDT)</strong></a></li> | |
| <li><a href="#exception-handling-1" | |
| id="toc-exception-handling-1"><strong>Exception | |
| Handling</strong></a></li> | |
| <li><a href="#hardware-vs-software-interrupts" | |
| id="toc-hardware-vs-software-interrupts"><strong>Hardware vs Software | |
| Interrupts</strong></a></li> | |
| </ul></li> | |
| <li><a href="#advanced-programmable-interrupt-controller-apic" | |
| id="toc-advanced-programmable-interrupt-controller-apic"><strong>13.2 | |
| Advanced Programmable Interrupt Controller (APIC)</strong></a> | |
| <ul> | |
| <li><a href="#local-apic-programming" | |
| id="toc-local-apic-programming"><strong>Local APIC | |
| Programming</strong></a></li> | |
| <li><a href="#io-apic-configuration" | |
| id="toc-io-apic-configuration"><strong>I/O APIC | |
| Configuration</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-14-security-extensions-and-virtualization" | |
| id="toc-chapter-14-security-extensions-and-virtualization"><strong>Chapter | |
| 14: Security Extensions and Virtualization</strong></a> | |
| <ul> | |
| <li><a href="#hardware-assisted-security-features" | |
| id="toc-hardware-assisted-security-features"><strong>14.1 | |
| Hardware-Assisted Security Features</strong></a> | |
| <ul> | |
| <li><a href="#nx-bit-no-execute" id="toc-nx-bit-no-execute"><strong>NX | |
| Bit (No-Execute)</strong></a></li> | |
| <li><a href="#smapsmep-supervisor-mode-accessexecution-prevention" | |
| id="toc-smapsmep-supervisor-mode-accessexecution-prevention"><strong>SMAP/SMEP | |
| (Supervisor Mode Access/Execution Prevention)</strong></a></li> | |
| <li><a href="#intel-cet-control-flow-enforcement-technology" | |
| id="toc-intel-cet-control-flow-enforcement-technology"><strong>Intel CET | |
| (Control-flow Enforcement Technology)</strong></a></li> | |
| <li><a href="#intel-sgx-software-guard-extensions" | |
| id="toc-intel-sgx-software-guard-extensions"><strong>Intel SGX (Software | |
| Guard Extensions)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#virtualization-architecture" | |
| id="toc-virtualization-architecture"><strong>14.2 Virtualization | |
| Architecture</strong></a> | |
| <ul> | |
| <li><a href="#intel-vt-x-vmx-fundamentals" | |
| id="toc-intel-vt-x-vmx-fundamentals"><strong>Intel VT-x (VMX) | |
| Fundamentals</strong></a></li> | |
| <li><a href="#extended-page-tables-ept" | |
| id="toc-extended-page-tables-ept"><strong>Extended Page Tables | |
| (EPT)</strong></a></li> | |
| </ul></li> | |
| <li><a href="#multi-core-and-multi-threading-security" | |
| id="toc-multi-core-and-multi-threading-security"><strong>14.3 Multi-Core | |
| and Multi-Threading Security</strong></a> | |
| <ul> | |
| <li><a href="#per-cpu-security-state" | |
| id="toc-per-cpu-security-state"><strong>Per-CPU Security | |
| State</strong></a></li> | |
| <li><a href="#speculation-control" | |
| id="toc-speculation-control"><strong>Speculation | |
| Control</strong></a></li> | |
| </ul></li> | |
| <li><a href="#secure-coding-practices" | |
| id="toc-secure-coding-practices"><strong>14.4 Secure Coding | |
| Practices</strong></a> | |
| <ul> | |
| <li><a href="#stack-protection" id="toc-stack-protection"><strong>Stack | |
| Protection</strong></a></li> | |
| <li><a href="#secure-memory-operations" | |
| id="toc-secure-memory-operations"><strong>Secure Memory | |
| Operations</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#chapter-15-performance-optimization-techniques" | |
| id="toc-chapter-15-performance-optimization-techniques">Chapter 15: | |
| Performance Optimization Techniques</a> | |
| <ul> | |
| <li><a href="#microarchitectural-optimization-fundamentals" | |
| id="toc-microarchitectural-optimization-fundamentals">15.1 | |
| Microarchitectural Optimization Fundamentals</a> | |
| <ul> | |
| <li><a href="#understanding-the-modern-x86-64-pipeline" | |
| id="toc-understanding-the-modern-x86-64-pipeline">Understanding the | |
| Modern x86-64 Pipeline</a></li> | |
| <li><a href="#execution-ports-and-throughput" | |
| id="toc-execution-ports-and-throughput">Execution Ports and | |
| Throughput</a></li> | |
| </ul></li> | |
| <li><a href="#branch-prediction-optimization" | |
| id="toc-branch-prediction-optimization">15.2 Branch Prediction | |
| Optimization</a> | |
| <ul> | |
| <li><a href="#static-branch-prediction" | |
| id="toc-static-branch-prediction">Static Branch Prediction</a></li> | |
| <li><a href="#loop-optimization-and-unrolling" | |
| id="toc-loop-optimization-and-unrolling">Loop Optimization and | |
| Unrolling</a></li> | |
| </ul></li> | |
| <li><a href="#memory-access-optimization" | |
| id="toc-memory-access-optimization">15.3 Memory Access Optimization</a> | |
| <ul> | |
| <li><a href="#cache-line-optimization" | |
| id="toc-cache-line-optimization">Cache Line Optimization</a></li> | |
| <li><a href="#non-temporal-stores-streaming-stores" | |
| id="toc-non-temporal-stores-streaming-stores">Non-Temporal Stores | |
| (Streaming Stores)</a></li> | |
| </ul></li> | |
| <li><a href="#simd-vectorization-techniques" | |
| id="toc-simd-vectorization-techniques">15.4 SIMD Vectorization | |
| Techniques</a> | |
| <ul> | |
| <li><a href="#auto-vectorization-patterns" | |
| id="toc-auto-vectorization-patterns">Auto-Vectorization | |
| Patterns</a></li> | |
| <li><a href="#fma-fused-multiply-add-optimization" | |
| id="toc-fma-fused-multiply-add-optimization">FMA (Fused Multiply-Add) | |
| Optimization</a></li> | |
| </ul></li> | |
| <li><a href="#instruction-level-parallelism" | |
| id="toc-instruction-level-parallelism">15.5 Instruction-Level | |
| Parallelism</a> | |
| <ul> | |
| <li><a href="#dependency-chain-breaking" | |
| id="toc-dependency-chain-breaking">Dependency Chain Breaking</a></li> | |
| <li><a href="#software-pipelining" id="toc-software-pipelining">Software | |
| Pipelining</a></li> | |
| </ul></li> | |
| <li><a href="#code-size-and-alignment-optimization" | |
| id="toc-code-size-and-alignment-optimization">15.6 Code Size and | |
| Alignment Optimization</a> | |
| <ul> | |
| <li><a href="#function-and-loop-alignment" | |
| id="toc-function-and-loop-alignment">Function and Loop | |
| Alignment</a></li> | |
| <li><a href="#instruction-selection-for-size" | |
| id="toc-instruction-selection-for-size">Instruction Selection for | |
| Size</a></li> | |
| </ul></li> | |
| <li><a href="#profile-guided-optimization" | |
| id="toc-profile-guided-optimization">15.7 Profile-Guided | |
| Optimization</a> | |
| <ul> | |
| <li><a href="#using-performance-counters" | |
| id="toc-using-performance-counters">Using Performance Counters</a></li> | |
| </ul></li> | |
| <li><a href="#practical-optimization-example" | |
| id="toc-practical-optimization-example">15.8 Practical Optimization | |
| Example</a></li> | |
| <li><a href="#performance-analysis-tools" | |
| id="toc-performance-analysis-tools">15.9 Performance Analysis Tools</a> | |
| <ul> | |
| <li><a href="#intel-vtune-profiler-integration" | |
| id="toc-intel-vtune-profiler-integration">Intel VTune Profiler | |
| Integration</a></li> | |
| </ul></li> | |
| <li><a href="#summary" id="toc-summary">Summary</a></li> | |
| <li><a href="#exercises" id="toc-exercises">Exercises</a></li> | |
| </ul></li> | |
| <li><a href="#chapter-16-code-generation-and-compiler-backend" | |
| id="toc-chapter-16-code-generation-and-compiler-backend">Chapter 16: | |
| Code Generation and Compiler Backend</a> | |
| <ul> | |
| <li><a href="#compiler-architecture-overview" | |
| id="toc-compiler-architecture-overview">16.1 Compiler Architecture | |
| Overview</a> | |
| <ul> | |
| <li><a href="#compilation-pipeline" | |
| id="toc-compilation-pipeline">Compilation Pipeline</a></li> | |
| </ul></li> | |
| <li><a href="#register-allocation" id="toc-register-allocation">16.2 | |
| Register Allocation</a> | |
| <ul> | |
| <li><a href="#graph-coloring-algorithm" | |
| id="toc-graph-coloring-algorithm">Graph Coloring Algorithm</a></li> | |
| <li><a href="#spill-code-generation" | |
| id="toc-spill-code-generation">Spill Code Generation</a></li> | |
| </ul></li> | |
| <li><a href="#instruction-selection" id="toc-instruction-selection">16.3 | |
| Instruction Selection</a> | |
| <ul> | |
| <li><a href="#pattern-matching-and-tiling" | |
| id="toc-pattern-matching-and-tiling">Pattern Matching and | |
| Tiling</a></li> | |
| <li><a href="#peephole-optimization" | |
| id="toc-peephole-optimization">Peephole Optimization</a></li> | |
| </ul></li> | |
| <li><a href="#jit-compilation-implementation" | |
| id="toc-jit-compilation-implementation">16.4 JIT Compilation | |
| Implementation</a> | |
| <ul> | |
| <li><a href="#basic-jit-compiler-structure" | |
| id="toc-basic-jit-compiler-structure">Basic JIT Compiler | |
| Structure</a></li> | |
| <li><a href="#advanced-jit-with-templates" | |
| id="toc-advanced-jit-with-templates">Advanced JIT with | |
| Templates</a></li> | |
| </ul></li> | |
| <li><a href="#dynamic-binary-translation" | |
| id="toc-dynamic-binary-translation">16.5 Dynamic Binary Translation</a> | |
| <ul> | |
| <li><a href="#self-modifying-code" | |
| id="toc-self-modifying-code">Self-Modifying Code</a></li> | |
| </ul></li> | |
| <li><a href="#machine-code-encoding" id="toc-machine-code-encoding">16.6 | |
| Machine Code Encoding</a> | |
| <ul> | |
| <li><a href="#x86-64-instruction-encoding" | |
| id="toc-x86-64-instruction-encoding">x86-64 Instruction | |
| Encoding</a></li> | |
| <li><a href="#building-an-assembler" | |
| id="toc-building-an-assembler">Building an Assembler</a></li> | |
| </ul></li> | |
| <li><a href="#optimization-pass-implementation" | |
| id="toc-optimization-pass-implementation">16.7 Optimization Pass | |
| Implementation</a> | |
| <ul> | |
| <li><a href="#dead-code-elimination" id="toc-dead-code-elimination">Dead | |
| Code Elimination</a></li> | |
| <li><a href="#constant-propagation" | |
| id="toc-constant-propagation">Constant Propagation</a></li> | |
| </ul></li> | |
| <li><a href="#llvm-integration" id="toc-llvm-integration">16.8 LLVM | |
| Integration</a> | |
| <ul> | |
| <li><a href="#llvm-ir-to-x86-64" id="toc-llvm-ir-to-x86-64">LLVM IR to | |
| x86-64</a></li> | |
| </ul></li> | |
| <li><a href="#register-allocation-1" id="toc-register-allocation-1">16.2 | |
| Register Allocation</a> | |
| <ul> | |
| <li><a href="#graph-coloring-allocation" | |
| id="toc-graph-coloring-allocation">Graph Coloring Allocation</a></li> | |
| </ul></li> | |
| <li><a href="#instruction-selection-1" | |
| id="toc-instruction-selection-1">16.3 Instruction Selection</a> | |
| <ul> | |
| <li><a href="#matching-and-tiling" id="toc-matching-and-tiling">Matching | |
| and Tiling</a></li> | |
| </ul></li> | |
| <li><a href="#late-stage-peephole-optimization" | |
| id="toc-late-stage-peephole-optimization">16.4 Late-stage (Peephole) | |
| Optimization</a></li> | |
| <li><a href="#jit-compilation" id="toc-jit-compilation">16.5 JIT | |
| Compilation</a></li> | |
| <li><a href="#dynamic-binary-translation-self-modifying-code" | |
| id="toc-dynamic-binary-translation-self-modifying-code">16.6 Dynamic | |
| Binary Translation &amp; Self-modifying Code</a></li> | |
| <li><a href="#machine-code-encoding-1" | |
| id="toc-machine-code-encoding-1">16.7 Machine Code Encoding</a> | |
| <ul> | |
| <li><a href="#encoding-format" id="toc-encoding-format">Encoding | |
| format:</a></li> | |
| </ul></li> | |
| <li><a href="#building-an-assembler-backend-emitter" | |
| id="toc-building-an-assembler-backend-emitter">16.8 Building an | |
| Assembler (Backend-emitter)</a></li> | |
| <li><a href="#backend-optimization-passes" | |
| id="toc-backend-optimization-passes">16.9 Backend Optimization | |
| Passes</a></li> | |
| <li><a href="#llvm-backend-integration" | |
| id="toc-llvm-backend-integration">16.10 LLVM Backend | |
| Integration</a></li> | |
| <li><a href="#summary-1" id="toc-summary-1">Summary</a> | |
| <ul> | |
| <li><a href="#exercises-1" id="toc-exercises-1">Exercises</a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#appendix-a-x86-64-instruction-reference-quick-guide" | |
| id="toc-appendix-a-x86-64-instruction-reference-quick-guide">Appendix A: | |
| x86-64 Instruction Reference Quick Guide</a> | |
| <ul> | |
| <li><a href="#a.1-instruction-format-overview" | |
| id="toc-a.1-instruction-format-overview">A.1 Instruction Format | |
| Overview</a> | |
| <ul> | |
| <li><a href="#general-encoding-structure" | |
| id="toc-general-encoding-structure">General Encoding Structure</a></li> | |
| <li><a href="#rex-prefix-40h-4fh" id="toc-rex-prefix-40h-4fh">REX Prefix | |
| (40h-4Fh)</a></li> | |
| <li><a href="#modrm-byte" id="toc-modrm-byte">ModR/M Byte</a></li> | |
| </ul></li> | |
| <li><a href="#a.2-data-movement-instructions" | |
| id="toc-a.2-data-movement-instructions">A.2 Data Movement | |
| Instructions</a> | |
| <ul> | |
| <li><a href="#basic-moves" id="toc-basic-moves">Basic Moves</a></li> | |
| <li><a href="#stack-operations" id="toc-stack-operations">Stack | |
| Operations</a></li> | |
| <li><a href="#conditional-moves-cmovcc" | |
| id="toc-conditional-moves-cmovcc">Conditional Moves (CMOVcc)</a></li> | |
| </ul></li> | |
| <li><a href="#a.3-arithmetic-instructions" | |
| id="toc-a.3-arithmetic-instructions">A.3 Arithmetic Instructions</a> | |
| <ul> | |
| <li><a href="#integer-arithmetic-2" | |
| id="toc-integer-arithmetic-2">Integer Arithmetic</a></li> | |
| <li><a href="#bcd-and-ascii-adjust-legacy" | |
| id="toc-bcd-and-ascii-adjust-legacy">BCD and ASCII Adjust | |
| (Legacy)</a></li> | |
| </ul></li> | |
| <li><a href="#a.4-logical-instructions" | |
| id="toc-a.4-logical-instructions">A.4 Logical Instructions</a></li> | |
| <li><a href="#a.5-shift-and-rotate-instructions" | |
| id="toc-a.5-shift-and-rotate-instructions">A.5 Shift and Rotate | |
| Instructions</a></li> | |
| <li><a href="#a.6-bit-manipulation-instructions" | |
| id="toc-a.6-bit-manipulation-instructions">A.6 Bit Manipulation | |
| Instructions</a></li> | |
| <li><a href="#a.7-control-transfer-instructions" | |
| id="toc-a.7-control-transfer-instructions">A.7 Control Transfer | |
| Instructions</a> | |
| <ul> | |
| <li><a href="#unconditional-jumps-1" | |
| id="toc-unconditional-jumps-1">Unconditional Jumps</a></li> | |
| <li><a href="#conditional-jumps-jcc" | |
| id="toc-conditional-jumps-jcc">Conditional Jumps (Jcc)</a></li> | |
| <li><a href="#loop-instructions-1" id="toc-loop-instructions-1">Loop | |
| Instructions</a></li> | |
| </ul></li> | |
| <li><a href="#a.8-string-instructions" | |
| id="toc-a.8-string-instructions">A.8 String Instructions</a></li> | |
| <li><a href="#a.9-flag-control-instructions" | |
| id="toc-a.9-flag-control-instructions">A.9 Flag Control | |
| Instructions</a></li> | |
| <li><a href="#a.10-system-instructions" | |
| id="toc-a.10-system-instructions">A.10 System Instructions</a></li> | |
| <li><a href="#a.11-simd-instructions-sseavx" | |
| id="toc-a.11-simd-instructions-sseavx">A.11 SIMD Instructions | |
| (SSE/AVX)</a> | |
| <ul> | |
| <li><a href="#data-movement" id="toc-data-movement">Data | |
| Movement</a></li> | |
| <li><a href="#arithmetic-packed" id="toc-arithmetic-packed">Arithmetic | |
| (Packed)</a></li> | |
| <li><a href="#logical" id="toc-logical">Logical</a></li> | |
| <li><a href="#comparison" id="toc-comparison">Comparison</a></li> | |
| <li><a href="#shufflepermute" | |
| id="toc-shufflepermute">Shuffle/Permute</a></li> | |
| </ul></li> | |
| <li><a href="#a.12-avxavx2-instructions" | |
| id="toc-a.12-avxavx2-instructions">A.12 AVX/AVX2 Instructions</a> | |
| <ul> | |
| <li><a href="#three-operand-form" | |
| id="toc-three-operand-form">Three-Operand Form</a></li> | |
| <li><a href="#fma-fused-multiply-add" | |
| id="toc-fma-fused-multiply-add">FMA (Fused Multiply-Add)</a></li> | |
| <li><a href="#gatherscatter-avx2avx-512" | |
| id="toc-gatherscatter-avx2avx-512">Gather/Scatter | |
| (AVX2/AVX-512)</a></li> | |
| </ul></li> | |
| <li><a href="#a.13-avx-512-instructions" | |
| id="toc-a.13-avx-512-instructions">A.13 AVX-512 Instructions</a> | |
| <ul> | |
| <li><a href="#mask-operations" id="toc-mask-operations">Mask | |
| Operations</a></li> | |
| <li><a href="#masked-operations" id="toc-masked-operations">Masked | |
| Operations</a></li> | |
| <li><a href="#special-avx-512-instructions" | |
| id="toc-special-avx-512-instructions">Special AVX-512 | |
| Instructions</a></li> | |
| </ul></li> | |
| <li><a href="#a.14-transactional-memory-tsx" | |
| id="toc-a.14-transactional-memory-tsx">A.14 Transactional Memory | |
| (TSX)</a></li> | |
| <li><a href="#a.15-security-extensions" | |
| id="toc-a.15-security-extensions">A.15 Security Extensions</a> | |
| <ul> | |
| <li><a href="#intel-cet-control-flow-enforcement" | |
| id="toc-intel-cet-control-flow-enforcement">Intel CET (Control-flow | |
| Enforcement)</a></li> | |
| <li><a href="#intel-sgx" id="toc-intel-sgx">Intel SGX</a></li> | |
| </ul></li> | |
| <li><a href="#a.16-common-instruction-patterns" | |
| id="toc-a.16-common-instruction-patterns">A.16 Common Instruction | |
| Patterns</a> | |
| <ul> | |
| <li><a href="#function-prologueepilogue" | |
| id="toc-function-prologueepilogue">Function Prologue/Epilogue</a></li> | |
| <li><a href="#system-v-amd64-abi-registers" | |
| id="toc-system-v-amd64-abi-registers">System V AMD64 ABI | |
| Registers</a></li> | |
| <li><a href="#windows-x64-abi-registers" | |
| id="toc-windows-x64-abi-registers">Windows x64 ABI Registers</a></li> | |
| </ul></li> | |
| <li><a href="#a.17-optimization-guidelines" | |
| id="toc-a.17-optimization-guidelines">A.17 Optimization Guidelines</a> | |
| <ul> | |
| <li><a href="#alignment" id="toc-alignment">Alignment</a></li> | |
| <li><a href="#instruction-selection-2" | |
| id="toc-instruction-selection-2">Instruction Selection</a></li> | |
| <li><a href="#pipeline-optimization" | |
| id="toc-pipeline-optimization">Pipeline Optimization</a></li> | |
| </ul></li> | |
| <li><a href="#summary-2" id="toc-summary-2">Summary</a></li> | |
| <li><a href="#instruction-extension-quick-map" | |
| id="toc-instruction-extension-quick-map"><strong>Instruction &amp; | |
| Extension Quick Map</strong></a> | |
| <ul> | |
| <li><a href="#encoding-basics" id="toc-encoding-basics"><strong>Encoding | |
| Basics</strong></a></li> | |
| <li><a href="#scalar-and-general-purpose-ops" | |
| id="toc-scalar-and-general-purpose-ops"><strong>Scalar and General | |
| Purpose Ops</strong></a></li> | |
| <li><a href="#sse3-ssse3-sse4-highlights" | |
| id="toc-sse3-ssse3-sse4-highlights"><strong>SSE3 / SSSE3 / SSE4 | |
| Highlights</strong></a></li> | |
| <li><a href="#avx-avx2" id="toc-avx-avx2"><strong>AVX / | |
| AVX2</strong></a></li> | |
| <li><a href="#avx512" id="toc-avx512"><strong>AVX‑512</strong></a></li> | |
| <li><a href="#systemarch" | |
| id="toc-systemarch"><strong>System/Arch</strong></a></li> | |
| </ul></li> | |
| </ul></li> | |
| <li><a href="#appendix-b-system-v-amd64-abi-summary" | |
| id="toc-appendix-b-system-v-amd64-abi-summary">Appendix B: System V | |
| AMD64 ABI Summary</a> | |
| <ul> | |
| <li><a href="#b.1-register-usage-conventions" | |
| id="toc-b.1-register-usage-conventions">B.1 Register Usage | |
| Conventions</a> | |
| <ul> | |
| <li><a href="#general-purpose-registers-1" | |
| id="toc-general-purpose-registers-1">General Purpose Registers</a></li> | |
| <li><a href="#floating-point-registers" | |
| id="toc-floating-point-registers">Floating-Point Registers</a></li> | |
| <li><a href="#special-registers" id="toc-special-registers">Special | |
| Registers</a></li> | |
| </ul></li> | |
| <li><a href="#b.2-function-calling-convention" | |
| id="toc-b.2-function-calling-convention">B.2 Function Calling | |
| Convention</a> | |
| <ul> | |
| <li><a href="#argument-passing" id="toc-argument-passing">Argument | |
| Passing</a></li> | |
| <li><a href="#classification-rules" | |
| id="toc-classification-rules">Classification Rules</a></li> | |
| <li><a href="#aggregate-structunion-passing" | |
| id="toc-aggregate-structunion-passing">Aggregate (Struct/Union) | |
| Passing</a></li> | |
| <li><a href="#variable-arguments-va_args" | |
| id="toc-variable-arguments-va_args">Variable Arguments | |
| (va_args)</a></li> | |
| </ul></li> | |
| <li><a href="#b.3-stack-frame-layout" | |
| id="toc-b.3-stack-frame-layout">B.3 Stack Frame Layout</a> | |
| <ul> | |
| <li><a href="#stack-organization-high-to-low-address" | |
| id="toc-stack-organization-high-to-low-address">Stack Organization (High | |
| to Low Address)</a></li> | |
| <li><a href="#red-zone-1" id="toc-red-zone-1">Red Zone</a></li> | |
| <li><a href="#stack-alignment-1" id="toc-stack-alignment-1">Stack | |
| Alignment</a></li> | |
| </ul></li> | |
| <li><a href="#b.4-return-values" id="toc-b.4-return-values">B.4 Return | |
| Values</a> | |
| <ul> | |
| <li><a href="#scalar-returns" id="toc-scalar-returns">Scalar | |
| Returns</a></li> | |
| <li><a href="#aggregate-returns" id="toc-aggregate-returns">Aggregate | |
| Returns</a></li> | |
| </ul></li> | |
| <li><a href="#b.5-function-prologue-and-epilogue" | |
| id="toc-b.5-function-prologue-and-epilogue">B.5 Function Prologue and | |
| Epilogue</a> | |
| <ul> | |
| <li><a href="#standard-prologue" id="toc-standard-prologue">Standard | |
| Prologue</a></li> | |
| <li><a href="#standard-epilogue" id="toc-standard-epilogue">Standard | |
| Epilogue</a></li> | |
| <li><a href="#leaf-function-optimization-1" | |
| id="toc-leaf-function-optimization-1">Leaf Function | |
| Optimization</a></li> | |
| </ul></li> | |
| <li><a href="#b.6-system-calls" id="toc-b.6-system-calls">B.6 System | |
| Calls</a> | |
| <ul> | |
| <li><a href="#linux-system-call-convention" | |
| id="toc-linux-system-call-convention">Linux System Call | |
| Convention</a></li> | |
| <li><a href="#system-call-example" id="toc-system-call-example">System | |
| Call Example</a></li> | |
| <li><a href="#common-system-call-numbers" | |
| id="toc-common-system-call-numbers">Common System Call Numbers</a></li> | |
| </ul></li> | |
| <li><a href="#b.7-thread-local-storage-tls" | |
| id="toc-b.7-thread-local-storage-tls">B.7 Thread-Local Storage (TLS)</a> | |
| <ul> | |
| <li><a href="#tls-access-models" id="toc-tls-access-models">TLS Access | |
| Models</a></li> | |
| </ul></li> | |
| <li><a href="#b.8-exception-handling" | |
| id="toc-b.8-exception-handling">B.8 Exception Handling</a> | |
| <ul> | |
| <li><a href="#stack-unwinding-dwarf" | |
| id="toc-stack-unwinding-dwarf">Stack Unwinding (DWARF)</a></li> | |
| <li><a href="#c-exception-handling" id="toc-c-exception-handling">C++ | |
| Exception Handling</a></li> | |
| </ul></li> | |
| <li><a href="#b.9-data-alignment-requirements" | |
| id="toc-b.9-data-alignment-requirements">B.9 Data Alignment | |
| Requirements</a> | |
| <ul> | |
| <li><a href="#structure-padding" id="toc-structure-padding">Structure | |
| Padding</a></li> | |
| </ul></li> | |
| <li><a href="#b.10-executable-file-format-elf" | |
| id="toc-b.10-executable-file-format-elf">B.10 Executable File Format | |
| (ELF)</a> | |
| <ul> | |
| <li><a href="#program-headers" id="toc-program-headers">Program | |
| Headers</a></li> | |
| </ul></li> | |
| <li><a href="#register-usage-and-preservation-rules" | |
| id="toc-register-usage-and-preservation-rules">1️⃣ Register Usage and | |
| Preservation Rules</a></li> | |
| <li><a href="#calling-convention-essentials" | |
| id="toc-calling-convention-essentials">2️⃣ Calling Convention | |
| Essentials</a></li> | |
| <li><a href="#stack-frame-and-alignment" | |
| id="toc-stack-frame-and-alignment">3️⃣ Stack Frame and Alignment</a></li> | |
| <li><a href="#system-calls-linux-amd64" | |
| id="toc-system-calls-linux-amd64">4️⃣ System Calls (Linux AMD64)</a></li> | |
| <li><a href="#data-alignment-rules" id="toc-data-alignment-rules">5️⃣ | |
| Data Alignment Rules</a></li> | |
| <li><a href="#threadlocal-storage-tls" | |
| id="toc-threadlocal-storage-tls">6️⃣ Thread‑Local Storage (TLS)</a></li> | |
| <li><a href="#exceptionunwind-info" id="toc-exceptionunwind-info">7️⃣ | |
| Exception/Unwind Info</a></li> | |
| </ul></li> | |
| <li><a href="#appendix-c-nasmgasmasm-syntax-comparison" | |
| id="toc-appendix-c-nasmgasmasm-syntax-comparison">Appendix C: | |
| NASM/GAS/MASM Syntax Comparison</a> | |
| <ul> | |
| <li><a href="#c.1-basic-syntax-differences" | |
| id="toc-c.1-basic-syntax-differences">C.1 Basic Syntax Differences</a> | |
| <ul> | |
| <li><a href="#instruction-format" | |
| id="toc-instruction-format">Instruction Format</a></li> | |
| <li><a href="#basic-instruction-examples" | |
| id="toc-basic-instruction-examples">Basic Instruction Examples</a></li> | |
| </ul></li> | |
| <li><a href="#c.2-memory-addressing" id="toc-c.2-memory-addressing">C.2 | |
| Memory Addressing</a> | |
| <ul> | |
| <li><a href="#direct-memory-access" id="toc-direct-memory-access">Direct | |
| Memory Access</a></li> | |
| <li><a href="#complex-addressing-modes-1" | |
| id="toc-complex-addressing-modes-1">Complex Addressing Modes</a></li> | |
| </ul></li> | |
| <li><a href="#c.3-data-definitions" id="toc-c.3-data-definitions">C.3 | |
| Data Definitions</a> | |
| <ul> | |
| <li><a href="#basic-data-types" id="toc-basic-data-types">Basic Data | |
| Types</a></li> | |
| <li><a href="#string-definitions" id="toc-string-definitions">String | |
| Definitions</a></li> | |
| </ul></li> | |
| <li><a href="#c.4-sections-and-segments" | |
| id="toc-c.4-sections-and-segments">C.4 Sections and Segments</a></li> | |
| <li><a href="#c.5-macros-and-directives" | |
| id="toc-c.5-macros-and-directives">C.5 Macros and Directives</a> | |
| <ul> | |
| <li><a href="#macro-definitions" id="toc-macro-definitions">Macro | |
| Definitions</a></li> | |
| <li><a href="#conditional-assembly" | |
| id="toc-conditional-assembly">Conditional Assembly</a></li> | |
| </ul></li> | |
| <li><a href="#c.6-symbols-and-labels" | |
| id="toc-c.6-symbols-and-labels">C.6 Symbols and Labels</a> | |
| <ul> | |
| <li><a href="#global-and-external-symbols" | |
| id="toc-global-and-external-symbols">Global and External | |
| Symbols</a></li> | |
| <li><a href="#alignment-directives" | |
| id="toc-alignment-directives">Alignment Directives</a></li> | |
| </ul></li> | |
| <li><a href="#c.7-procedure-definitions" | |
| id="toc-c.7-procedure-definitions">C.7 Procedure Definitions</a> | |
| <ul> | |
| <li><a href="#function-declaration" | |
| id="toc-function-declaration">Function Declaration</a></li> | |
| </ul></li> | |
| <li><a href="#c.8-simd-instructions" id="toc-c.8-simd-instructions">C.8 | |
| SIMD Instructions</a> | |
| <ul> | |
| <li><a href="#sseavx-instructions" id="toc-sseavx-instructions">SSE/AVX | |
| Instructions</a></li> | |
| <li><a href="#avx-512-with-masking" | |
| id="toc-avx-512-with-masking">AVX-512 with Masking</a></li> | |
| </ul></li> | |
| <li><a href="#c.9-system-instructions" | |
| id="toc-c.9-system-instructions">C.9 System Instructions</a> | |
| <ul> | |
| <li><a href="#privileged-instructions" | |
| id="toc-privileged-instructions">Privileged Instructions</a></li> | |
| </ul></li> | |
| </ul></li> | |
| </ul> | |
| </nav> | |
| <main> | |
| <header id="title-block-header"> | |
| <h1 class="title">Dossier - x86asm-dossier</h1> | |
| </header> | |
| <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/katex.min.js" integrity="sha384-XjKyOOlGwcjNTAIQHIpgOno0Hl1YQqzUOEleOLALmuqehneUG+vnGctmUb0ZY0l8" crossorigin="anonymous"></script> | |
| <!-- Auto-render extension for automatic math rendering --> | |
| <script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.9/dist/contrib/auto-render.min.js" integrity="sha384-+VBxd3r6XgURycqtZ117nYw44OOcIax56Z4dCRWbxyPt0Koah1uHoK0o4+/RRE05" crossorigin="anonymous"></script> | |
| <!-- Initialize KaTeX auto-render --> | |
| <script> | |
| document.addEventListener("DOMContentLoaded", function() { | |
| renderMathInElement(document.body, { | |
| delimiters: [ | |
| {left: "", right: "", display: true}, | |
| {left: "\\[", right: "\\]", display: true}, | |
| {left: "\\(", right: "\\)", display: true} | |
| ], | |
| throwOnError: false | |
| }); | |
| }); | |
| </script> | |
| <h2 id="chapter-1-introduction-to-x86-64-architecture"><strong>Chapter | |
| 1: Introduction to x86-64 Architecture</strong></h2> | |
| <h3 id="evolution-from-8086-to-x86-64"><strong>1.1 Evolution from 8086 | |
| to x86-64</strong></h3> | |
| <h4 id="the-journey-from-16-bit-to-64-bit"><strong>The Journey from | |
| 16-bit to 64-bit</strong></h4> | |
| <p>The x86-64 architecture, also known as AMD64 or Intel 64, represents | |
| the culmination of over four decades of evolutionary development that | |
| began with Intel’s 8086 processor in 1978. Understanding this evolution | |
| is crucial for both assembly programmers and compiler engineers, as many | |
| architectural decisions in modern x86-64 CPUs stem from maintaining | |
| backward compatibility while extending capabilities.</p> | |
| <p>The 8086 introduced a 16-bit architecture with segmented memory | |
| addressing, allowing access to 1MB of memory through 20-bit addresses | |
| formed by combining 16-bit segment and offset values. This seemingly | |
| simple design decision would influence x86 architecture for decades to | |
| come:</p> | |
| <pre class="assembly"><code>; 8086 segmented addressing example | |
| mov ax, 0x1234 ; Load segment value | |
| mov ds, ax ; Set data segment | |
| mov bx, [0x5678] ; Access memory at DS:0x5678 (physical: 0x179B8)</code></pre> | |
| <h4 id="the-32-bit-revolution-80386-and-ia-32"><strong>The 32-bit | |
| Revolution: 80386 and IA-32</strong></h4> | |
| <p>The 80386, introduced in 1985, brought true 32-bit computing to the | |
| x86 family. This processor introduced:</p> | |
| <ul> | |
| <li><p><strong>32-bit general-purpose registers</strong> (EAX, EBX, ECX, | |
| EDX, ESI, EDI, EBP, ESP)</p></li> | |
| <li><p><strong>Flat memory model</strong> with 4GB address | |
| space</p></li> | |
| <li><p><strong>32-bit protected mode</strong> with privilege levels and memory | |
| protection (extending the 16-bit protected mode introduced by the 80286)</p></li> | |
| <li><p><strong>Virtual memory</strong> support with paging</p></li> | |
| </ul> | |
| <pre class="assembly"><code>; 32-bit code example | |
| mov eax, [ebx + ecx*4 + 0x1000] ; Complex addressing modes | |
| push ebp ; 32-bit stack operations | |
| mov ebp, esp</code></pre> | |
| <p>The IA-32 architecture maintained full backward compatibility, | |
| running 16-bit code in “real mode” or “virtual 8086 mode” while offering | |
| protected mode for modern operating systems.</p> | |
| <h4 id="the-64-bit-extension-amd64-and-intel-64"><strong>The 64-bit | |
| Extension: AMD64 and Intel 64</strong></h4> | |
| <p>In 2003, AMD introduced the x86-64 architecture with the Opteron and | |
| Athlon 64 processors, later adopted by Intel as Intel 64. This extension | |
| brought revolutionary changes while maintaining the x86 legacy:</p> | |
| <p><strong>Key Enhancements:</strong></p> | |
| <ul> | |
| <li><p><strong>64-bit general-purpose registers</strong> (RAX, RBX, RCX, | |
| RDX, RSI, RDI, RBP, RSP)</p></li> | |
| <li><p><strong>Eight new general-purpose registers</strong> | |
| (R8-R15)</p></li> | |
| <li><p><strong>64-bit instruction pointer</strong> (RIP) with | |
| RIP-relative addressing</p></li> | |
| <li><p><strong>Larger virtual address space</strong> (48-bit in initial | |
| implementations, up to 57-bit in recent CPUs)</p></li> | |
| <li><p><strong>SSE2 as baseline</strong> floating-point | |
| architecture</p></li> | |
| <li><p><strong>NX bit</strong> for enhanced security</p></li> | |
| </ul> | |
| <pre class="assembly"><code>; 64-bit code showcasing new features | |
| mov rax, 0x123456789ABCDEF0 ; 64-bit immediate | |
| mov r10, [rip + data_label] ; RIP-relative addressing | |
| add r8d, r9d ; New registers (32-bit portion) | |
| movaps xmm0, [rsp + 16] ; SSE mandatory in 64-bit mode</code></pre> | |
| <h4 id="compiler-perspective-evolutionary-complexity"><strong>Compiler | |
| Perspective: Evolutionary Complexity</strong></h4> | |
| <p>From a compiler’s viewpoint, this evolution presents both | |
| opportunities and challenges:</p> | |
| <ol type="1"> | |
| <li><p><strong>Register Allocation</strong>: The increase from 8 to 16 | |
| general-purpose registers dramatically improves register allocation | |
| algorithms’ effectiveness, reducing memory traffic.</p></li> | |
| <li><p><strong>Addressing Modes</strong>: The addition of RIP-relative | |
| addressing enables position-independent code generation without | |
| performance penalties.</p></li> | |
| <li><p><strong>Compatibility Burden</strong>: Compilers must handle | |
| multiple target modes (16-bit, 32-bit, 64-bit) with different | |
| instruction encodings and constraints.</p></li> | |
| </ol> | |
| <h3 id="x86-64-execution-environment-and-modes"><strong>1.2 x86-64 | |
| Execution Environment and Modes</strong></h3> | |
| <h4 id="operating-modes"><strong>Operating Modes</strong></h4> | |
| <p>The x86-64 architecture supports several operating modes, each with | |
| distinct characteristics:</p> | |
| <h5 id="long-mode-64-bit-mode"><strong>Long Mode (64-bit | |
| Mode)</strong></h5> | |
| <p>The primary operating mode for modern operating systems, consisting | |
| of two sub-modes:</p> | |
| <ul> | |
| <li><p><strong>64-bit Mode</strong>: Full 64-bit operation with all | |
| architectural enhancements</p></li> | |
| <li><p><strong>Compatibility Mode</strong>: Runs legacy 32-bit and | |
| 16-bit protected mode applications without modification</p></li> | |
| </ul> | |
| <pre class="assembly"><code>; 64-bit mode characteristics | |
| ; Default operand size: 32-bit | |
| mov eax, ebx ; 32-bit operation (default) | |
| mov rax, rbx ; 64-bit operation (REX prefix required) | |
| ; Default address size: 64-bit | |
| mov rax, [rbx] ; 64-bit addressing | |
| mov rax, [ebx] ; 32-bit addressing (0x67 prefix)</code></pre> | |
| <h5 id="legacy-modes"><strong>Legacy Modes</strong></h5> | |
| <ul> | |
| <li><p><strong>Protected Mode</strong>: 32-bit operation, used by 32-bit | |
| operating systems</p></li> | |
| <li><p><strong>Real Mode</strong>: 16-bit operation, used during system | |
| boot</p></li> | |
| <li><p><strong>System Management Mode (SMM)</strong>: Special mode for | |
| system firmware</p></li> | |
| </ul> | |
| <h4 id="execution-state"><strong>Execution State</strong></h4> | |
| <p>The processor execution state in 64-bit mode includes:</p> | |
| <div class="sourceCode" id="cb5"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb5-1"><a href="#cb5-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Conceptual representation of CPU state</span></span> | |
| <span id="cb5-2"><a href="#cb5-2" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> X86_64_State <span class="op">{</span></span> | |
| <span id="cb5-3"><a href="#cb5-3" aria-hidden="true" tabindex="-1"></a> <span class="co">// General-purpose registers</span></span> | |
| <span id="cb5-4"><a href="#cb5-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> rax<span class="op">,</span> rbx<span class="op">,</span> rcx<span class="op">,</span> rdx<span class="op">;</span></span> | |
| <span id="cb5-5"><a href="#cb5-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> rsi<span class="op">,</span> rdi<span class="op">,</span> rbp<span class="op">,</span> rsp<span class="op">;</span></span> | |
| <span id="cb5-6"><a href="#cb5-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> r8<span class="op">,</span> r9<span class="op">,</span> r10<span class="op">,</span> r11<span class="op">;</span></span> | |
| <span id="cb5-7"><a href="#cb5-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> r12<span class="op">,</span> r13<span class="op">,</span> r14<span class="op">,</span> r15<span class="op">;</span></span> | |
| <span id="cb5-8"><a href="#cb5-8" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb5-9"><a href="#cb5-9" aria-hidden="true" tabindex="-1"></a> <span class="co">// Instruction pointer</span></span> | |
| <span id="cb5-10"><a href="#cb5-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> rip<span class="op">;</span></span> | |
| <span id="cb5-11"><a href="#cb5-11" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb5-12"><a href="#cb5-12" aria-hidden="true" tabindex="-1"></a> <span class="co">// Flags register</span></span> | |
| <span id="cb5-13"><a href="#cb5-13" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> rflags<span class="op">;</span></span> | |
| <span id="cb5-14"><a href="#cb5-14" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb5-15"><a href="#cb5-15" aria-hidden="true" tabindex="-1"></a> <span class="co">// Segment registers (mostly unused in 64-bit)</span></span> | |
| <span id="cb5-16"><a href="#cb5-16" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint16_t</span> cs<span class="op">,</span> ds<span class="op">,</span> es<span class="op">,</span> fs<span class="op">,</span> gs<span class="op">,</span> ss<span class="op">;</span></span> | |
| <span id="cb5-17"><a href="#cb5-17" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb5-18"><a href="#cb5-18" aria-hidden="true" tabindex="-1"></a> <span class="co">// Control registers</span></span> | |
| <span id="cb5-19"><a href="#cb5-19" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> cr0<span class="op">,</span> cr2<span class="op">,</span> cr3<span class="op">,</span> cr4<span class="op">,</span> cr8<span class="op">;</span></span> | |
| <span id="cb5-20"><a href="#cb5-20" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb5-21"><a href="#cb5-21" aria-hidden="true" tabindex="-1"></a> <span class="co">// Debug registers</span></span> | |
| <span id="cb5-22"><a href="#cb5-22" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> dr0<span class="op">,</span> dr1<span class="op">,</span> dr2<span class="op">,</span> dr3<span class="op">;</span></span> | |
| <span id="cb5-23"><a href="#cb5-23" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> dr6<span class="op">,</span> dr7<span class="op">;</span></span> | |
| <span id="cb5-24"><a href="#cb5-24" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb5-25"><a href="#cb5-25" aria-hidden="true" tabindex="-1"></a> <span class="co">// XMM/YMM/ZMM registers for SIMD</span></span> | |
| <span id="cb5-26"><a href="#cb5-26" aria-hidden="true" tabindex="-1"></a> <span class="kw">union</span> <span class="op">{</span></span> | |
| <span id="cb5-27"><a href="#cb5-27" aria-hidden="true" tabindex="-1"></a> uint128_t xmm<span class="op">[</span><span class="dv">32</span><span class="op">];</span> <span class="co">// SSE</span></span> | |
| <span id="cb5-28"><a href="#cb5-28" aria-hidden="true" tabindex="-1"></a> uint256_t ymm<span class="op">[</span><span class="dv">32</span><span class="op">];</span> <span class="co">// AVX</span></span> | |
| <span id="cb5-29"><a href="#cb5-29" aria-hidden="true" tabindex="-1"></a> uint512_t zmm<span class="op">[</span><span class="dv">32</span><span class="op">];</span> <span class="co">// AVX-512</span></span> | |
| <span id="cb5-30"><a href="#cb5-30" aria-hidden="true" tabindex="-1"></a> <span class="op">};</span></span> | |
| <span id="cb5-31"><a href="#cb5-31" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb5-32"><a href="#cb5-32" aria-hidden="true" tabindex="-1"></a> <span class="co">// x87 FPU state (legacy)</span></span> | |
| <span id="cb5-33"><a href="#cb5-33" aria-hidden="true" tabindex="-1"></a> <span class="dt">long</span> <span class="dt">double</span> st<span class="op">[</span><span class="dv">8</span><span class="op">];</span></span> | |
| <span id="cb5-34"><a href="#cb5-34" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint16_t</span> fpu_control<span class="op">,</span> fpu_status<span class="op">,</span> fpu_tag<span class="op">;</span></span> | |
| <span id="cb5-35"><a href="#cb5-35" aria-hidden="true" tabindex="-1"></a><span class="op">};</span></span></code></pre></div> | |
| <h4 id="privilege-levels-and-protection"><strong>Privilege Levels and | |
| Protection</strong></h4> | |
| <p>x86-64 maintains the four privilege levels (rings) from IA-32:</p> | |
| <ul> | |
| <li><p><strong>Ring 0</strong>: Kernel mode (highest privilege)</p></li> | |
| <li><p><strong>Ring 1-2</strong>: Rarely used (device drivers in some | |
| systems)</p></li> | |
| <li><p><strong>Ring 3</strong>: User mode (lowest privilege)</p></li> | |
| </ul> | |
| <pre class="assembly"><code>; Checking current privilege level | |
| mov rax, cs | |
| and rax, 3 ; Extract CPL (Current Privilege Level) | |
| jz kernel_mode ; Jump if in ring 0</code></pre> | |
| <h3 | |
| id="register-architecture-general-purpose-segment-and-system-registers"><strong>1.3 | |
| Register Architecture: General Purpose, Segment, and System | |
| Registers</strong></h3> | |
| <h4 id="general-purpose-registers"><strong>General-Purpose | |
| Registers</strong></h4> | |
| <p>The x86-64 architecture provides 16 general-purpose registers, each | |
| 64 bits wide, with accessible sub-registers:</p> | |
| <pre class="assembly"><code>; Register naming conventions and sub-registers | |
| ; 64-bit | 32-bit | 16-bit | 8-bit high | 8-bit low | |
| ; RAX | EAX | AX | AH | AL | |
| ; RBX | EBX | BX | BH | BL | |
| ; RCX | ECX | CX | CH | CL | |
| ; RDX | EDX | DX | DH | DL | |
| ; RSI | ESI | SI | - | SIL | |
| ; RDI | EDI | DI | - | DIL | |
| ; RBP | EBP | BP | - | BPL | |
| ; RSP | ESP | SP | - | SPL | |
| ; R8 | R8D | R8W | - | R8B | |
| ; R9 | R9D | R9W | - | R9B | |
| ; R10 | R10D | R10W | - | R10B | |
| ; R11 | R11D | R11W | - | R11B | |
| ; R12 | R12D | R12W | - | R12B | |
| ; R13 | R13D | R13W | - | R13B | |
| ; R14 | R14D | R14W | - | R14B | |
| ; R15 | R15D | R15W | - | R15B</code></pre> | |
| <p><strong>Important Behavior</strong>: Operations on 32-bit | |
| sub-registers zero-extend to 64 bits:</p> | |
| <pre class="assembly"><code> mov rax, 0xFFFFFFFFFFFFFFFF | |
| mov eax, 0x12345678 ; RAX now contains 0x0000000012345678 | |
| ; But 8-bit and 16-bit operations don't zero-extend | |
| mov rax, 0xFFFFFFFFFFFFFFFF | |
| mov ax, 0x1234 ; RAX now contains 0xFFFFFFFFFFFF1234 | |
| mov al, 0x56 ; RAX now contains 0xFFFFFFFFFFFF1256</code></pre> | |
| <h4 id="special-purpose-registers"><strong>Special-Purpose | |
| Registers</strong></h4> | |
| <pre class="assembly"><code>; RFLAGS register (selected bits) | |
| ; Bit | Name | Description | |
| ; 0 | CF | Carry Flag | |
| ; 2 | PF | Parity Flag | |
| ; 4 | AF | Auxiliary Carry Flag | |
| ; 6 | ZF | Zero Flag | |
| ; 7 | SF | Sign Flag | |
| ; 8 | TF | Trap Flag | |
| ; 9 | IF | Interrupt Enable Flag | |
| ; 10 | DF | Direction Flag | |
| ; 11 | OF | Overflow Flag | |
| ; 12-13| IOPL| I/O Privilege Level | |
| ; 14 | NT | Nested Task | |
| ; 16 | RF | Resume Flag | |
| ; 17 | VM | Virtual-8086 Mode | |
| ; 18 | AC | Alignment Check | |
| ; 19 | VIF | Virtual Interrupt Flag | |
| ; 20 | VIP | Virtual Interrupt Pending | |
| ; 21 | ID | CPUID available | |
| pushfq ; Push RFLAGS | |
| pop rax ; Read RFLAGS into RAX | |
| or rax, 0x200 ; Set IF (enable interrupts) | |
| push rax | |
| popfq ; Restore modified RFLAGS</code></pre> | |
| <h4 id="segment-registers-in-64-bit-mode"><strong>Segment Registers in | |
| 64-bit Mode</strong></h4> | |
| <p>While segmentation is largely disabled in 64-bit mode, segment | |
| registers still serve important purposes:</p> | |
| <pre class="assembly"><code>; CS (Code Segment) - determines execution mode and privilege | |
| ; SS (Stack Segment) - largely ignored, but checked for NULL | |
| ; DS, ES - completely ignored in 64-bit mode | |
| ; FS, GS - used for thread-local storage and special OS purposes | |
| ; Typical FS/GS usage in Linux | |
| mov rax, fs:[0] ; Access thread-local storage | |
| ; Windows uses GS for TEB (Thread Environment Block) | |
| mov rax, gs:[0x60] ; Get PEB pointer from TEB (gs:[0x30] holds the TEB self-pointer)</code></pre> | |
| <h4 id="control-registers"><strong>Control Registers</strong></h4> | |
| <p>Control registers govern fundamental CPU behavior:</p> | |
| <pre class="assembly"><code>; CR0 - System control flags | |
| ; Bit 0 (PE): Protected Mode Enable | |
| ; Bit 16 (WP): Write Protect | |
| ; Bit 31 (PG): Paging Enable | |
| ; CR3 - Page Directory Base (top-level page table pointer) | |
| mov rax, cr3 ; Read current page table base | |
| and rax, ~0xFFF ; Mask out PCID and flags | |
| mov cr3, rax ; Flush TLB by reloading CR3 | |
| ; CR4 - Architecture extensions | |
| ; Bit 5 (PAE): Physical Address Extension | |
| ; Bit 7 (PGE): Page Global Enable | |
| ; Bit 10 (OSXMMEXCPT): OS SIMD exception support | |
| ; Bit 18 (OSXSAVE): XSAVE enabled</code></pre> | |
| <h4 id="model-specific-registers-msrs"><strong>Model-Specific Registers | |
| (MSRs)</strong></h4> | |
| <p>MSRs provide access to processor-specific features:</p> | |
| <pre class="assembly"><code>; Reading MSRs | |
| mov ecx, 0xC0000080 ; EFER MSR (Extended Feature Enable) | |
| rdmsr ; Read MSR into EDX:EAX | |
| ; Bit 8 (LME): Long Mode Enable | |
| ; Bit 10 (LMA): Long Mode Active | |
| ; Bit 11 (NXE): No-Execute Enable | |
| ; Writing MSRs (privileged operation) | |
| mov ecx, 0x277 ; IA32_PAT MSR (Page Attribute Table) | |
| mov edx, 0x00070406 | |
| mov eax, 0x00070406 | |
| wrmsr ; Write EDX:EAX to MSR</code></pre> | |
| <h4 id="compiler-register-usage-conventions"><strong>Compiler Register | |
| Usage Conventions</strong></h4> | |
| <p>Different ABIs specify register usage:</p> | |
| <div class="sourceCode" id="cb13"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb13-1"><a href="#cb13-1" aria-hidden="true" tabindex="-1"></a><span class="co">// System V AMD64 ABI (Linux, macOS, BSD)</span></span> | |
| <span id="cb13-2"><a href="#cb13-2" aria-hidden="true" tabindex="-1"></a><span class="co">// Function parameters: RDI, RSI, RDX, RCX, R8, R9</span></span> | |
| <span id="cb13-3"><a href="#cb13-3" aria-hidden="true" tabindex="-1"></a><span class="co">// Return value: RAX (RDX:RAX for 128-bit)</span></span> | |
| <span id="cb13-4"><a href="#cb13-4" aria-hidden="true" tabindex="-1"></a><span class="co">// Callee-saved: RBX, RBP, R12-R15</span></span> | |
| <span id="cb13-5"><a href="#cb13-5" aria-hidden="true" tabindex="-1"></a><span class="co">// Caller-saved: RAX, RCX, RDX, RSI, RDI, R8-R11</span></span> | |
| <span id="cb13-6"><a href="#cb13-6" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb13-7"><a href="#cb13-7" aria-hidden="true" tabindex="-1"></a><span class="co">// Microsoft x64 ABI (Windows)</span></span> | |
| <span id="cb13-8"><a href="#cb13-8" aria-hidden="true" tabindex="-1"></a><span class="co">// Function parameters: RCX, RDX, R8, R9</span></span> | |
| <span id="cb13-9"><a href="#cb13-9" aria-hidden="true" tabindex="-1"></a><span class="co">// Return value: RAX</span></span> | |
| <span id="cb13-10"><a href="#cb13-10" aria-hidden="true" tabindex="-1"></a><span class="co">// Callee-saved: RBX, RBP, RDI, RSI, RSP, R12-R15</span></span> | |
| <span id="cb13-11"><a href="#cb13-11" aria-hidden="true" tabindex="-1"></a><span class="co">// Caller-saved: RAX, RCX, RDX, R8-R11</span></span></code></pre></div> | |
| <h3 id="memory-models-and-addressing"><strong>1.4 Memory Models and | |
| Addressing</strong></h3> | |
| <h4 id="virtual-address-space"><strong>Virtual Address | |
| Space</strong></h4> | |
| <p>The x86-64 architecture implements a 64-bit virtual address space, | |
| though current implementations use only 48 bits (4-level paging) or 57 bits (5-level paging):</p> | |
| <div class="sourceCode" id="cb14"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb14-1"><a href="#cb14-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Canonical address form (48-bit implementation)</span></span> | |
| <span id="cb14-2"><a href="#cb14-2" aria-hidden="true" tabindex="-1"></a><span class="co">// Bits 63:48 must be copies of bit 47</span></span> | |
| <span id="cb14-3"><a href="#cb14-3" aria-hidden="true" tabindex="-1"></a><span class="co">// Valid ranges:</span></span> | |
| <span id="cb14-4"><a href="#cb14-4" aria-hidden="true" tabindex="-1"></a><span class="co">// 0x0000000000000000 - 0x00007FFFFFFFFFFF (user space)</span></span> | |
| <span id="cb14-5"><a href="#cb14-5" aria-hidden="true" tabindex="-1"></a><span class="co">// 0xFFFF800000000000 - 0xFFFFFFFFFFFFFFFF (kernel space)</span></span> | |
| <span id="cb14-6"><a href="#cb14-6" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb14-7"><a href="#cb14-7" aria-hidden="true" tabindex="-1"></a><span class="dt">bool</span> is_canonical_address<span class="op">(</span><span class="dt">uint64_t</span> addr<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb14-8"><a href="#cb14-8" aria-hidden="true" tabindex="-1"></a> <span class="co">// Check if address is in canonical form</span></span> | |
| <span id="cb14-9"><a href="#cb14-9" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> high_bits <span class="op">=</span> addr <span class="op">>></span> <span class="dv">47</span><span class="op">;</span></span> | |
| <span id="cb14-10"><a href="#cb14-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> high_bits <span class="op">==</span> <span class="dv">0</span> <span class="op">||</span> high_bits <span class="op">==</span> <span class="bn">0x1FFFF</span><span class="op">;</span></span> | |
| <span id="cb14-11"><a href="#cb14-11" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span></code></pre></div> | |
| <h4 id="memory-segmentation-in-64-bit-mode"><strong>Memory Segmentation | |
| in 64-bit Mode</strong></h4> | |
| <p>Segmentation is largely disabled in 64-bit mode:</p> | |
| <pre class="assembly"><code>; Segment registers in 64-bit mode: | |
| ; - Base addresses forced to 0 (except FS/GS) | |
| ; - Limits not checked (except for FS/GS in some cases) | |
| ; - CS still determines privilege level and operating mode | |
| ; Setting up FS base for thread-local storage | |
| mov ecx, 0xC0000100 ; FS_BASE MSR | |
| mov edx, 0 ; High 32 bits of base | |
| mov eax, thread_data ; Low 32 bits of base | |
| wrmsr | |
| ; Now FS-relative addressing uses thread_data as base | |
| mov rax, fs:[0] ; Load from thread_data + 0</code></pre> | |
| <h4 id="addressing-modes"><strong>Addressing Modes</strong></h4> | |
| <p>x86-64 supports complex addressing modes with the general form: | |
| <strong>[base + index*scale + displacement]</strong></p> | |
| <pre class="assembly"><code>; Direct addressing | |
| mov rax, [0x401000] ; Absolute address (rare in 64-bit) | |
| ; Register indirect | |
| mov rax, [rbx] ; Address in RBX | |
| ; Register + displacement | |
| mov rax, [rbx + 8] ; RBX + 8 | |
| mov rax, [rbx - 16] ; RBX - 16 | |
| ; Indexed addressing | |
| mov rax, [rbx + rcx*8] ; RBX + RCX*8 (scale: 1, 2, 4, or 8) | |
| ; Full complex addressing | |
| mov rax, [rbx + rcx*8 + 16] ; RBX + RCX*8 + 16 (base + index*scale + displacement)</code></pre> | |
| <hr /> | |
| <h2 | |
| id="chapter-2-x86-64-instruction-set-architecture-fundamentals"><strong>Chapter | |
| 2: x86-64 Instruction Set Architecture Fundamentals</strong></h2> | |
| <h3 id="instruction-format-and-prefixes-rex-vex-evex"><strong>2.1 | |
| Instruction Format and Prefixes (REX, VEX, EVEX)</strong></h3> | |
| <h4 id="basic-instruction-format"><strong>Basic Instruction | |
| Format</strong></h4> | |
| <p>x86-64 instructions consist of several optional and mandatory | |
| components that can create instructions from 1 to 15 bytes in | |
| length:</p> | |
| <p>[Prefixes] [REX] [Opcode] [ModR/M] [SIB] [Displacement] | |
| [Immediate]</p> | |
| <p>Let’s examine each component:</p> | |
| <pre class="assembly"><code>; Example: mov rax, [rbx + rcx*8 + 0x1000] | |
| ; Encoding: 48 8B 84 CB 00 10 00 00 | |
| ; 48 - REX.W prefix (64-bit operand) | |
| ; 8B - Opcode (MOV r64, r/m64) | |
| ; 84 - ModR/M byte (mod=10, reg=000, r/m=100) | |
| ; CB - SIB byte (scale=11, index=001, base=011) | |
| ; 00 10 00 00 - 32-bit displacement (0x1000)</code></pre> | |
| <h4 id="legacy-prefixes"><strong>Legacy Prefixes</strong></h4> | |
| <p>Legacy prefixes modify instruction behavior and can appear in any | |
| order:</p> | |
| <pre class="assembly"><code>; Prefix groups (max one from each group): | |
| ; Group 1: Lock and repeat | |
| lock add [rax], rbx ; F0 - LOCK prefix for atomic operations | |
| rep movsb ; F3 - REP prefix for string operations | |
| ; Group 2: Segment override (largely ignored in 64-bit mode) | |
| mov rax, fs:[rbx] ; 64 - FS segment override | |
| mov rax, gs:[0] ; 65 - GS segment override | |
| ; Group 3: Operand size override | |
| mov ax, bx ; 66 - 16-bit operation in 64-bit mode | |
| ; Group 4: Address size override | |
| mov rax, [ebx] ; 67 - 32-bit addressing in 64-bit mode</code></pre> | |
| <h4 id="rex-prefix"><strong>REX Prefix</strong></h4> | |
| <p>The REX (Register Extension) prefix is crucial for 64-bit operations | |
| and accessing extended registers:</p> | |
| <p>REX = 0100WRXB</p> | |
| <ul> | |
| <li><p>W: 64-bit operand size</p></li> | |
| <li><p>R: Extension of ModR/M reg field</p></li> | |
| <li><p>X: Extension of SIB index field</p></li> | |
| <li><p>B: Extension of ModR/M r/m field, SIB base, or opcode | |
| reg</p></li> | |
| </ul> | |
| <pre class="assembly"><code>; REX prefix examples | |
| mov r8, rax ; 49 89 C0 - REX.B for r8 | |
| mov rax, r9 ; 4C 89 C8 - REX.R for r9 | |
| mov r10, r11 ; 4D 89 DA - REX.RB for both | |
| mov eax, ebx ; 89 D8 - No REX (32-bit) | |
| mov rax, rbx ; 48 89 D8 - REX.W (64-bit)</code></pre> | |
| <h4 id="vex-prefix-avx"><strong>VEX Prefix (AVX)</strong></h4> | |
| <p>VEX encoding enables three-operand forms and accesses YMM | |
| registers:</p> | |
| <pre class="assembly"><code>; 2-byte VEX: C5 [R vvvv L pp] | |
| ; 3-byte VEX: C4 [RXB map] [W vvvv L pp] | |
| vaddps ymm0, ymm1, ymm2 ; C5 F4 58 C2 | |
| ; Non-destructive: ymm0 = ymm1 + ymm2 | |
| ; Compare with legacy SSE: | |
| addps xmm0, xmm1 ; 0F 58 C1 | |
| ; Destructive: xmm0 = xmm0 + xmm1</code></pre> | |
| <h4 id="evex-prefix-avx-512"><strong>EVEX Prefix (AVX-512)</strong></h4> | |
| <p>EVEX extends VEX with masking, broadcasting, and 512-bit | |
| operations:</p> | |
| <pre class="assembly"><code>; 4-byte EVEX: 62 [RXBR'00mm] [Wvvvv1pp] [zLLb Vaaa] | |
| vaddps zmm0{k1}, zmm1, zmm2 ; Masked addition | |
| vbroadcastss zmm0, [rax] ; Broadcast single value | |
| vaddps zmm0, zmm1, [rax]{1to16} ; Memory broadcast</code></pre> | |
| <h4 id="compiler-encoding-decisions"><strong>Compiler Encoding | |
| Decisions</strong></h4> | |
| <p>Compilers must choose optimal encodings:</p> | |
| <div class="sourceCode" id="cb22"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb22-1"><a href="#cb22-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler's encoding selection logic</span></span> | |
| <span id="cb22-2"><a href="#cb22-2" aria-hidden="true" tabindex="-1"></a><span class="dt">void</span> select_encoding<span class="op">(</span>Instruction<span class="op">*</span> insn<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb22-3"><a href="#cb22-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> <span class="op">(</span>insn<span class="op">-></span>needs_rex<span class="op">())</span> <span class="op">{</span></span> | |
| <span id="cb22-4"><a href="#cb22-4" aria-hidden="true" tabindex="-1"></a> <span class="co">// Use REX for extended registers or 64-bit ops</span></span> | |
| <span id="cb22-5"><a href="#cb22-5" aria-hidden="true" tabindex="-1"></a> emit_rex<span class="op">(</span>insn<span class="op">);</span></span> | |
| <span id="cb22-6"><a href="#cb22-6" aria-hidden="true" tabindex="-1"></a> <span class="op">}</span> <span class="cf">else</span> <span class="cf">if</span> <span class="op">(</span>insn<span class="op">-></span>is_vector<span class="op">()</span> <span class="op">&&</span> insn<span class="op">-></span>has_avx<span class="op">())</span> <span class="op">{</span></span> | |
| <span id="cb22-7"><a href="#cb22-7" aria-hidden="true" tabindex="-1"></a> <span class="co">// Prefer VEX for AVX instructions</span></span> | |
| <span id="cb22-8"><a href="#cb22-8" aria-hidden="true" tabindex="-1"></a> emit_vex<span class="op">(</span>insn<span class="op">);</span></span> | |
| <span id="cb22-9"><a href="#cb22-9" aria-hidden="true" tabindex="-1"></a> <span class="op">}</span> <span class="cf">else</span> <span class="cf">if</span> <span class="op">(</span>insn<span class="op">-></span>needs_evex_features<span class="op">())</span> <span class="op">{</span></span> | |
| <span id="cb22-10"><a href="#cb22-10" aria-hidden="true" tabindex="-1"></a> <span class="co">// Use EVEX for AVX-512 or special features</span></span> | |
| <span id="cb22-11"><a href="#cb22-11" aria-hidden="true" tabindex="-1"></a> emit_evex<span class="op">(</span>insn<span class="op">);</span></span> | |
| <span id="cb22-12"><a href="#cb22-12" aria-hidden="true" tabindex="-1"></a> <span class="op">}</span></span> | |
| <span id="cb22-13"><a href="#cb22-13" aria-hidden="true" tabindex="-1"></a> <span class="co">// Minimize instruction size when possible</span></span> | |
| <span id="cb22-14"><a href="#cb22-14" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span></code></pre></div> | |
| <h3 id="data-movement-instructions"><strong>2.2 Data Movement | |
| Instructions</strong></h3> | |
| <h4 id="basic-move-instructions"><strong>Basic Move | |
| Instructions</strong></h4> | |
| <p>The MOV instruction family forms the foundation of data movement:</p> | |
| <pre class="assembly"><code>; Register to register | |
| mov rax, rbx ; 64-bit | |
| mov eax, ebx ; 32-bit (zero-extends to 64-bit) | |
| mov ax, bx ; 16-bit (preserves upper bits) | |
| mov al, bl ; 8-bit (preserves upper bits) | |
| ; Immediate to register | |
| mov rax, 0x123456789 ; 64-bit immediate (10-byte encoding) | |
| mov eax, 0x12345678 ; 32-bit immediate (5-byte encoding) | |
| mov rax, -1 ; Encoded as sign-extended imm32 (7 bytes: 48 C7 C0 FF FF FF FF), not a 10-byte imm64 | |
| ; Memory operations | |
| mov rax, [rbx] ; Load | |
| mov [rbx], rax ; Store | |
| mov qword [rbx], 100 ; Immediate to memory</code></pre> | |
| <h4 id="zero-and-sign-extension"><strong>Zero and Sign | |
| Extension</strong></h4> | |
| <pre class="assembly"><code>; Zero extension | |
| movzx eax, byte [rbx] ; Zero-extend byte to 32-bit | |
| movzx rax, word [rbx] ; Zero-extend word to 64-bit | |
| ; Sign extension | |
| movsx eax, byte [rbx] ; Sign-extend byte to 32-bit | |
| movsxd rax, dword [rbx] ; Sign-extend dword to 64-bit | |
| ; Implicit zero extension with 32-bit ops | |
| mov eax, [rbx] ; Zeros bits 63:32</code></pre> | |
| <h4 id="conditional-moves"><strong>Conditional Moves</strong></h4> | |
| <p>Conditional moves eliminate branches for simple selections:</p> | |
| <pre class="assembly"><code>; cmovcc reg, reg/mem | |
| cmp rax, rbx | |
| cmovl rax, rcx ; Move if less (signed) | |
| cmovb rax, rcx ; Move if below (unsigned) | |
| ; Compiler pattern for: x = (a < b) ? c : d | |
| cmp rdi, rsi ; Compare a, b | |
| mov rax, rcx ; rax = d (fourth System V argument; MOV leaves the flags intact) | |
| cmovl rax, rdx ; rax = c if a < b</code></pre> | |
| <h4 id="special-data-movement"><strong>Special Data | |
| Movement</strong></h4> | |
| <pre class="assembly"><code>; Exchange | |
| xchg rax, rbx ; Swap two registers | |
| xchg [mem], rax ; Implicit LOCK prefix | |
| ; Load effective address | |
| lea rax, [rbx + rcx*8 + 16] ; Address calculation | |
| lea rdi, [rip + data] ; RIP-relative addressing | |
| ; Stack operations | |
| push rax ; RSP -= 8; [RSP] = RAX | |
| pop rbx ; RBX = [RSP]; RSP += 8 | |
| ; Special moves | |
| bswap rax ; Byte swap (endianness conversion) | |
| lock cmpxchg [rbx], rcx ; If RAX == [RBX] then [RBX] = RCX, else RAX = [RBX] (atomic only with LOCK)</code></pre> | |
| <h4 id="compiler-optimization-patterns"><strong>Compiler Optimization | |
| Patterns</strong></h4> | |
| <div class="sourceCode" id="cb27"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb27-1"><a href="#cb27-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Structure copy optimization</span></span> | |
| <span id="cb27-2"><a href="#cb27-2" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Point <span class="op">{</span> <span class="dt">long</span> x<span class="op">,</span> y<span class="op">,</span> z<span class="op">;</span> <span class="op">};</span></span> | |
| <span id="cb27-3"><a href="#cb27-3" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb27-4"><a href="#cb27-4" aria-hidden="true" tabindex="-1"></a><span class="co">// Naive approach: multiple loads/stores</span></span> | |
| <span id="cb27-5"><a href="#cb27-5" aria-hidden="true" tabindex="-1"></a><span class="dt">void</span> copy_naive<span class="op">(</span>Point<span class="op">*</span> dst<span class="op">,</span> Point<span class="op">*</span> src<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb27-6"><a href="#cb27-6" aria-hidden="true" tabindex="-1"></a> dst<span class="op">-></span>x <span class="op">=</span> src<span class="op">-></span>x<span class="op">;</span></span> | |
| <span id="cb27-7"><a href="#cb27-7" aria-hidden="true" tabindex="-1"></a> dst<span class="op">-></span>y <span class="op">=</span> src<span class="op">-></span>y<span class="op">;</span></span> | |
| <span id="cb27-8"><a href="#cb27-8" aria-hidden="true" tabindex="-1"></a> dst<span class="op">-></span>z <span class="op">=</span> src<span class="op">-></span>z<span class="op">;</span></span> | |
| <span id="cb27-9"><a href="#cb27-9" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb27-10"><a href="#cb27-10" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb27-11"><a href="#cb27-11" aria-hidden="true" tabindex="-1"></a><span class="co">// Optimized assembly:</span></span> | |
| <span id="cb27-12"><a href="#cb27-12" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rax, [rsi]</span></span> | |
| <span id="cb27-13"><a href="#cb27-13" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rdx, [rsi+8]</span></span> | |
| <span id="cb27-14"><a href="#cb27-14" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rcx, [rsi+16]</span></span> | |
| <span id="cb27-15"><a href="#cb27-15" aria-hidden="true" tabindex="-1"></a><span class="co">// mov [rdi], rax</span></span> | |
| <span id="cb27-16"><a href="#cb27-16" aria-hidden="true" tabindex="-1"></a><span class="co">// mov [rdi+8], rdx</span></span> | |
| <span id="cb27-17"><a href="#cb27-17" aria-hidden="true" tabindex="-1"></a><span class="co">// mov [rdi+16], rcx</span></span> | |
| <span id="cb27-18"><a href="#cb27-18" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb27-19"><a href="#cb27-19" aria-hidden="true" tabindex="-1"></a><span class="co">// Or with SIMD:</span></span> | |
| <span id="cb27-20"><a href="#cb27-20" aria-hidden="true" tabindex="-1"></a><span class="co">// movups xmm0, [rsi]</span></span> | |
| <span id="cb27-21"><a href="#cb27-21" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rax, [rsi+16] ; only 8 bytes remain in the 24-byte struct</span></span> | |
| <span id="cb27-22"><a href="#cb27-22" aria-hidden="true" tabindex="-1"></a><span class="co">// movups [rdi], xmm0</span></span> | |
| <span id="cb27-23"><a href="#cb27-23" aria-hidden="true" tabindex="-1"></a><span class="co">// mov [rdi+16], rax</span></span></code></pre></div> | |
| <h3 id="arithmetic-and-logic-operations"><strong>2.3 Arithmetic and | |
| Logic Operations</strong></h3> | |
| <h4 id="integer-arithmetic"><strong>Integer Arithmetic</strong></h4> | |
| <pre class="assembly"><code>; Addition and subtraction | |
| add rax, rbx ; rax += rbx, sets flags | |
| adc rax, rbx ; rax += rbx + CF (multi-precision) | |
| sub rax, rbx ; rax -= rbx | |
| sbb rax, rbx ; rax -= rbx + CF | |
| ; Increment/decrement (don't affect CF) | |
| inc rax ; rax++ | |
| dec rbx ; rbx-- | |
| ; Multiplication | |
| mul rbx ; RDX:RAX = RAX * RBX (unsigned) | |
| imul rbx ; RDX:RAX = RAX * RBX (signed) | |
| imul rax, rbx ; RAX = RAX * RBX (truncated) | |
| imul rax, rbx, 5 ; RAX = RBX * 5 | |
| ; Division | |
| xor rdx, rdx ; Clear high dividend | |
| div rbx ; RAX = RDX:RAX / RBX, RDX = remainder | |
| idiv rbx ; Signed division (sign-extend RAX into RDX with CQO first, not XOR) | |
| ; LEA for arithmetic | |
| lea rax, [rbx + rcx] ; Addition without flags | |
| lea rax, [rbx + rbx*4] ; Multiply by 5 | |
| lea rax, [rbx + rbx*2 + 7] ; rax = rbx*3 + 7</code></pre> | |
| <h4 id="logical-operations"><strong>Logical Operations</strong></h4> | |
| <pre class="assembly"><code>; Bitwise operations | |
| and rax, rbx ; Bitwise AND | |
| or rax, rbx ; Bitwise OR | |
| xor rax, rbx ; Bitwise XOR | |
| not rax ; Bitwise NOT | |
| ; Testing without modifying | |
| test rax, rbx ; AND but only set flags | |
| test rax, rax ; Common idiom to check zero/sign | |
| ; Bit manipulation | |
| bt rax, 5 ; Test bit 5 | |
| bts rax, 5 ; Test and set bit 5 | |
| btr rax, 5 ; Test and reset bit 5 | |
| btc rax, 5 ; Test and complement bit 5</code></pre> | |
| <h4 id="flag-manipulation"><strong>Flag Manipulation</strong></h4> | |
| <pre class="assembly"><code>; Direct flag operations | |
| clc ; Clear carry flag | |
| stc ; Set carry flag | |
| cmc ; Complement carry flag | |
| cld ; Clear direction flag | |
| std ; Set direction flag | |
| ; Flag-based byte set | |
| cmp rax, rbx | |
| setl al ; AL = 1 if less, 0 otherwise | |
| sete al ; AL = 1 if equal | |
| ; Compiler pattern for: bool result = (a < b) | |
| cmp rdi, rsi | |
| setl al | |
| movzx eax, al ; Zero-extend to full register</code></pre> | |
| <h3 id="bit-manipulation-and-shifts"><strong>2.4 Bit Manipulation and | |
| Shifts</strong></h3> | |
| <h4 id="shift-operations"><strong>Shift Operations</strong></h4> | |
| <pre class="assembly"><code>; Logical shifts (fill with zeros) | |
| shl rax, 5 ; Shift left by 5 | |
| shr rax, cl ; Shift right by CL bits | |
| ; Arithmetic shifts (preserve sign) | |
| sal rax, 5 ; Same as SHL | |
| sar rax, cl ; Arithmetic right shift | |
| ; Rotates | |
| rol rax, 8 ; Rotate left | |
| ror rax, cl ; Rotate right | |
| rcl rax, 1 ; Rotate through carry left | |
| rcr rax, 1 ; Rotate through carry right | |
| ; Double-precision shifts | |
| shld rax, rbx, 5 ; Shift RAX left, fill from RBX | |
| shrd rax, rbx, cl ; Shift RAX right, fill from RBX</code></pre> | |
| <h4 id="bit-scanning-and-manipulation"><strong>Bit Scanning and | |
| Manipulation</strong></h4> | |
| <pre class="assembly"><code>; Find first set bit | |
| bsf rax, rbx ; Scan forward (LSB to MSB) | |
| bsr rax, rbx ; Scan reverse (MSB to LSB) | |
| ; Leading/trailing zeros (with BMI) | |
| lzcnt rax, rbx ; Count leading zeros | |
| tzcnt rax, rbx ; Count trailing zeros | |
| ; Population count | |
| popcnt rax, rbx ; Count set bits | |
| ; BMI extensions | |
| andn rax, rbx, rcx ; RAX = ~RBX & RCX | |
| blsi rax, rbx ; Extract lowest set bit | |
| blsr rax, rbx ; Reset lowest set bit | |
| blsmsk rax, rbx ; Mask up to lowest set bit | |
| ; BMI2 advanced operations | |
| pdep rax, rbx, rcx ; Parallel bit deposit | |
| pext rax, rbx, rcx ; Parallel bit extract | |
| bzhi rax, rbx, rcx ; Zero high bits | |
| mulx rdx, rax, rbx ; Unsigned multiply without flags</code></pre> | |
| <h4 id="compiler-bit-manipulation-patterns"><strong>Compiler Bit | |
| Manipulation Patterns</strong></h4> | |
| <div class="sourceCode" id="cb33"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb33-1"><a href="#cb33-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Efficient bit field extraction</span></span> | |
| <span id="cb33-2"><a href="#cb33-2" aria-hidden="true" tabindex="-1"></a><span class="dt">uint64_t</span> extract_bits<span class="op">(</span><span class="dt">uint64_t</span> value<span class="op">,</span> <span class="dt">int</span> start<span class="op">,</span> <span class="dt">int</span> length<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb33-3"><a href="#cb33-3" aria-hidden="true" tabindex="-1"></a> <span class="co">// Compiler may generate:</span></span> | |
| <span id="cb33-4"><a href="#cb33-4" aria-hidden="true" tabindex="-1"></a> <span class="co">// mov rax, rdi</span></span> | |
| <span id="cb33-5"><a href="#cb33-5" aria-hidden="true" tabindex="-1"></a> <span class="co">// shrx rax, rax, rsi ; Shift by start (BMI2; plain SHR needs the count in CL)</span></span> | |
| <span id="cb33-6"><a href="#cb33-6" aria-hidden="true" tabindex="-1"></a> <span class="co">// bzhi rax, rax, rdx ; Zero bits above length</span></span> | |
| <span id="cb33-7"><a href="#cb33-7" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> <span class="op">(</span>value <span class="op">>></span> start<span class="op">)</span> <span class="op">&</span> <span class="op">((</span><span class="dv">1</span><span class="bu">ULL</span> <span class="op"><<</span> length<span class="op">)</span> <span class="op">-</span> <span class="dv">1</span><span class="op">);</span></span> | |
| <span id="cb33-8"><a href="#cb33-8" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb33-9"><a href="#cb33-9" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb33-10"><a href="#cb33-10" aria-hidden="true" tabindex="-1"></a><span class="co">// Bit permutation with PDEP</span></span> | |
| <span id="cb33-11"><a href="#cb33-11" aria-hidden="true" tabindex="-1"></a><span class="dt">uint64_t</span> pack_rgb_to_565<span class="op">(</span><span class="dt">uint8_t</span> r<span class="op">,</span> <span class="dt">uint8_t</span> g<span class="op">,</span> <span class="dt">uint8_t</span> b<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb33-12"><a href="#cb33-12" aria-hidden="true" tabindex="-1"></a> <span class="co">// With BMI2 (mask must be in a register; PDEP deposits the low source bits, so r, g, b are assumed already reduced to 5/6/5 bits):</span></span> | |
| <span id="cb33-13"><a href="#cb33-13" aria-hidden="true" tabindex="-1"></a> <span class="co">// pdep eax, edi, r8d ; r8d = 0xF800: R in bits 15:11</span></span> | |
| <span id="cb33-14"><a href="#cb33-14" aria-hidden="true" tabindex="-1"></a> <span class="co">// pdep ecx, esi, r9d ; r9d = 0x07E0: G in bits 10:5</span></span> | |
| <span id="cb33-15"><a href="#cb33-15" aria-hidden="true" tabindex="-1"></a> <span class="co">// or eax, ecx</span></span> | |
| <span id="cb33-16"><a href="#cb33-16" aria-hidden="true" tabindex="-1"></a> <span class="co">// pdep ecx, edx, r10d ; r10d = 0x001F: B in bits 4:0</span></span> | |
| <span id="cb33-17"><a href="#cb33-17" aria-hidden="true" tabindex="-1"></a> <span class="co">// or eax, ecx</span></span> | |
| <span id="cb33-18"><a href="#cb33-18" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> <span class="op">((</span>r <span class="op">&</span> <span class="bn">0xF8</span><span class="op">)</span> <span class="op"><<</span> <span class="dv">8</span><span class="op">)</span> <span class="op">|</span> <span class="op">((</span>g <span class="op">&</span> <span class="bn">0xFC</span><span class="op">)</span> <span class="op"><<</span> <span class="dv">3</span><span class="op">)</span> <span class="op">|</span> <span class="op">(</span>b <span class="op">>></span> <span class="dv">3</span><span class="op">);</span></span> | |
| <span id="cb33-19"><a href="#cb33-19" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span></code></pre></div> | |
| <h3 id="control-flow-branches-loops-and-calls"><strong>2.5 Control Flow: | |
| Branches, Loops, and Calls</strong></h3> | |
| <h4 id="unconditional-jumps"><strong>Unconditional Jumps</strong></h4> | |
| <pre class="assembly"><code>; Direct jump | |
| jmp label ; RIP = label | |
| ; Indirect jump | |
| jmp rax ; RIP = RAX | |
| jmp qword [rbx] ; RIP = memory[RBX] | |
| ; Function calls | |
| call function ; Push return address, jump | |
| call rax ; Indirect call | |
| call qword [rbx + rax*8] ; Call through function table | |
| ; Returns | |
| ret ; Pop return address to RIP | |
| ret 16 ; Return and adjust RSP by 16</code></pre> | |
| <h4 id="conditional-branches"><strong>Conditional Branches</strong></h4> | |
| <pre class="assembly"><code>; Based on single flag | |
| je label ; Jump if equal (ZF=1) | |
| jne label ; Jump if not equal (ZF=0) | |
| jc label ; Jump if carry (CF=1) | |
| jnc label ; Jump if not carry (CF=0) | |
| ; Based on comparisons (signed) | |
| jl label ; Jump if less | |
| jle label ; Jump if less or equal | |
| jg label ; Jump if greater | |
| jge label ; Jump if greater or equal | |
| ; Based on comparisons (unsigned) | |
| jb label ; Jump if below | |
| jbe label ; Jump if below or equal | |
| ja label ; Jump if above | |
| jae label ; Jump if above or equal | |
| ; Special conditions | |
| jo label ; Jump if overflow | |
| js label ; Jump if sign (negative) | |
| jp label ; Jump if parity even | |
| jrcxz label ; Jump if RCX is zero (JECXZ tests ECX; JCXZ is not available in 64-bit mode)</code></pre> | |
| <h4 id="loop-instructions"><strong>Loop Instructions</strong></h4> | |
| <pre class="assembly"><code>; Traditional loop instructions (slower on modern CPUs) | |
| mov rcx, 100 | |
| .loop: | |
| ; ... loop body ... | |
| loop .loop ; Decrement RCX and jump if non-zero | |
| ; Preferred pattern for modern CPUs | |
| mov rcx, 100 | |
| .loop: | |
| ; ... loop body ... | |
| dec rcx | |
| jnz .loop | |
| ; String operation loops | |
| mov rcx, string_length | |
| rep movsb ; Repeat MOVSB RCX times</code></pre> | |
| <h4 id="compiler-control-flow-patterns"><strong>Compiler Control Flow | |
| Patterns</strong></h4> | |
| <div class="sourceCode" id="cb37"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb37-1"><a href="#cb37-1" aria-hidden="true" tabindex="-1"></a><span class="co">// If-else pattern</span></span> | |
| <span id="cb37-2"><a href="#cb37-2" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> <span class="op">(</span>a <span class="op"><</span> b<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb37-3"><a href="#cb37-3" aria-hidden="true" tabindex="-1"></a> x <span class="op">=</span> y<span class="op">;</span></span> | |
| <span id="cb37-4"><a href="#cb37-4" aria-hidden="true" tabindex="-1"></a><span class="op">}</span> <span class="cf">else</span> <span class="op">{</span></span> | |
| <span id="cb37-5"><a href="#cb37-5" aria-hidden="true" tabindex="-1"></a> x <span class="op">=</span> z<span class="op">;</span></span> | |
| <span id="cb37-6"><a href="#cb37-6" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb37-7"><a href="#cb37-7" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb37-8"><a href="#cb37-8" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler generates:</span></span> | |
| <span id="cb37-9"><a href="#cb37-9" aria-hidden="true" tabindex="-1"></a><span class="co">// cmp rdi, rsi</span></span> | |
| <span id="cb37-10"><a href="#cb37-10" aria-hidden="true" tabindex="-1"></a><span class="co">// jge .else</span></span> | |
| <span id="cb37-11"><a href="#cb37-11" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rax, rdx</span></span> | |
| <span id="cb37-12"><a href="#cb37-12" aria-hidden="true" tabindex="-1"></a><span class="co">// jmp .end</span></span> | |
| <span id="cb37-13"><a href="#cb37-13" aria-hidden="true" tabindex="-1"></a><span class="co">// .else:</span></span> | |
| <span id="cb37-14"><a href="#cb37-14" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rax, rcx</span></span> | |
| <span id="cb37-15"><a href="#cb37-15" aria-hidden="true" tabindex="-1"></a><span class="co">// .end:</span></span> | |
| <span id="cb37-16"><a href="#cb37-16" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb37-17"><a href="#cb37-17" aria-hidden="true" tabindex="-1"></a><span class="co">// Switch statement (jump table)</span></span> | |
| <span id="cb37-18"><a href="#cb37-18" aria-hidden="true" tabindex="-1"></a><span class="cf">switch</span> <span class="op">(</span>x<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb37-19"><a href="#cb37-19" aria-hidden="true" tabindex="-1"></a> <span class="cf">case</span> <span class="dv">0</span><span class="op">:</span> <span class="cf">return</span> a<span class="op">;</span></span> | |
| <span id="cb37-20"><a href="#cb37-20" aria-hidden="true" tabindex="-1"></a> <span class="cf">case</span> <span class="dv">1</span><span class="op">:</span> <span class="cf">return</span> b<span class="op">;</span></span> | |
| <span id="cb37-21"><a href="#cb37-21" aria-hidden="true" tabindex="-1"></a> <span class="cf">case</span> <span class="dv">2</span><span class="op">:</span> <span class="cf">return</span> c<span class="op">;</span></span> | |
| <span id="cb37-22"><a href="#cb37-22" aria-hidden="true" tabindex="-1"></a> <span class="cf">default</span><span class="op">:</span> <span class="cf">return</span> d<span class="op">;</span></span> | |
| <span id="cb37-23"><a href="#cb37-23" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb37-24"><a href="#cb37-24" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb37-25"><a href="#cb37-25" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler may generate:</span></span> | |
| <span id="cb37-26"><a href="#cb37-26" aria-hidden="true" tabindex="-1"></a><span class="co">// cmp edi, 2</span></span> | |
| <span id="cb37-27"><a href="#cb37-27" aria-hidden="true" tabindex="-1"></a><span class="co">// ja .default</span></span> | |
| <span id="cb37-28"><a href="#cb37-28" aria-hidden="true" tabindex="-1"></a><span class="co">// lea rax, [rip + .jump_table]</span></span> | |
| <span id="cb37-29"><a href="#cb37-29" aria-hidden="true" tabindex="-1"></a><span class="co">// movsxd rdi, dword [rax + rdi*4]</span></span> | |
| <span id="cb37-30"><a href="#cb37-30" aria-hidden="true" tabindex="-1"></a><span class="co">// add rax, rdi</span></span> | |
| <span id="cb37-31"><a href="#cb37-31" aria-hidden="true" tabindex="-1"></a><span class="co">// jmp rax</span></span> | |
| <span id="cb37-32"><a href="#cb37-32" aria-hidden="true" tabindex="-1"></a><span class="co">// .jump_table:</span></span> | |
| <span id="cb37-33"><a href="#cb37-33" aria-hidden="true" tabindex="-1"></a><span class="co">// dd .case0 - .jump_table</span></span> | |
| <span id="cb37-34"><a href="#cb37-34" aria-hidden="true" tabindex="-1"></a><span class="co">// dd .case1 - .jump_table</span></span> | |
| <span id="cb37-35"><a href="#cb37-35" aria-hidden="true" tabindex="-1"></a><span class="co">// dd .case2 - .jump_table</span></span></code></pre></div> | |
| <h4 id="branch-prediction-considerations"><strong>Branch Prediction | |
| Considerations</strong></h4> | |
| <pre class="assembly"><code>; Predictable branches (favor forward not-taken, backward taken) | |
| .loop: | |
| ; ... work ... | |
| dec rcx | |
| jnz .loop ; Backward branch, predicted taken | |
| test rax, rax | |
| jz .skip ; Forward branch, predicted not-taken | |
| ; ... common case ... | |
| .skip: | |
| ; Branch hints (legacy, mostly ignored by modern CPUs) | |
| 2E jz .unlikely ; 2E prefix = "not taken" hint | |
| 3E jnz .likely ; 3E prefix = "taken" hint</code></pre> | |
| <h3 id="string-operations"><strong>2.6 String Operations</strong></h3> | |
| <h4 id="basic-string-instructions"><strong>Basic String | |
| Instructions</strong></h4> | |
| <pre class="assembly"><code>; String move operations | |
| movsb ; Move byte [RSI] to [RDI], adjust pointers | |
| movsw ; Move word | |
| movsd ; Move dword | |
| movsq ; Move qword | |
| ; String compare | |
| cmpsb ; Compare bytes at [RSI] and [RDI] | |
| ; String scan | |
| scasb ; Compare AL with [RDI] | |
| ; String store | |
| stosb ; Store AL at [RDI] | |
| ; String load | |
| lodsb ; Load [RSI] into AL | |
| ; Direction flag controls pointer adjustment | |
| cld ; Clear DF: increment pointers | |
| std ; Set DF: decrement pointers</code></pre> | |
| <h4 id="rep-prefixes"><strong>REP Prefixes</strong></h4> | |
| <pre class="assembly"><code>; Repeat string operations | |
| mov rcx, 1000 | |
| rep movsb ; Copy RCX bytes | |
| mov rcx, 1000 | |
| mov al, 0 | |
| rep stosb ; Fill RCX bytes with zero | |
| ; Conditional repeats | |
| mov rcx, 1000 | |
| repne scasb ; Scan while not equal | |
| ; RCX now contains remaining count | |
| mov rcx, 1000 | |
| repe cmpsb ; Compare while equal</code></pre> | |
| <h4 id="optimized-string-operations"><strong>Optimized String | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; Fast memory copy pattern | |
| memcpy: | |
| mov rax, rdi ; Save destination | |
| cmp rdx, 32 | |
| jb .small | |
| ; Large copy with SIMD | |
| .large_loop: | |
| movdqu xmm0, [rsi] ; Unaligned moves: memcpy pointers carry no alignment guarantee | |
| movdqu xmm1, [rsi+16] | |
| movdqu [rdi], xmm0 | |
| movdqu [rdi+16], xmm1 | |
| add rsi, 32 | |
| add rdi, 32 | |
| sub rdx, 32 | |
| cmp rdx, 32 | |
| jae .large_loop | |
| .small: | |
| ; Handle remaining bytes | |
| test rdx, rdx | |
| jz .done | |
| mov rcx, rdx ; REP MOVSB takes its count from RCX | |
| rep movsb | |
| .done: | |
| ret</code></pre> | |
| <h4 id="compiler-string-intrinsics"><strong>Compiler String | |
| Intrinsics</strong></h4> | |
| <div class="sourceCode" id="cb42"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb42-1"><a href="#cb42-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler recognition of patterns</span></span> | |
| <span id="cb42-2"><a href="#cb42-2" aria-hidden="true" tabindex="-1"></a><span class="dt">void</span><span class="op">*</span> memset_pattern<span class="op">(</span><span class="dt">void</span><span class="op">*</span> s<span class="op">,</span> <span class="dt">int</span> c<span class="op">,</span> <span class="dt">size_t</span> n<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb42-3"><a href="#cb42-3" aria-hidden="true" tabindex="-1"></a> <span class="co">// Compiler may replace with:</span></span> | |
| <span id="cb42-4"><a href="#cb42-4" aria-hidden="true" tabindex="-1"></a> <span class="co">// mov r8, rdi ; save s for the return value</span></span> | |
| <span id="cb42-5"><a href="#cb42-5" aria-hidden="true" tabindex="-1"></a> <span class="co">// mov eax, esi ; REP STOSB stores AL, so c goes there</span></span> | |
| <span id="cb42-6"><a href="#cb42-6" aria-hidden="true" tabindex="-1"></a> <span class="co">// mov rcx, rdx</span></span> | |
| <span id="cb42-7"><a href="#cb42-7" aria-hidden="true" tabindex="-1"></a> <span class="co">// rep stosb</span></span> | |
| <span id="cb42-8"><a href="#cb42-8" aria-hidden="true" tabindex="-1"></a> <span class="co">// mov rax, r8 ; return s</span></span> | |
| <span id="cb42-8a"><a href="#cb42-8a" aria-hidden="true" tabindex="-1"></a> <span class="co">// ret</span></span> | |
| <span id="cb42-9"><a href="#cb42-9" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb42-10"><a href="#cb42-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">unsigned</span> <span class="dt">char</span><span class="op">*</span> p <span class="op">=</span> <span class="op">(</span><span class="dt">unsigned</span> <span class="dt">char</span><span class="op">*)</span>s<span class="op">;</span></span> | |
| <span id="cb42-11"><a href="#cb42-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">while</span> <span class="op">(</span>n<span class="op">--)</span> <span class="op">*</span>p<span class="op">++</span> <span class="op">=</span> c<span class="op">;</span></span> | |
| <span id="cb42-12"><a href="#cb42-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> s<span class="op">;</span></span> | |
| <span id="cb42-13"><a href="#cb42-13" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb42-14"><a href="#cb42-14" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb42-15"><a href="#cb42-15" aria-hidden="true" tabindex="-1"></a><span class="co">// Modern compilers optimize to:</span></span> | |
| <span id="cb42-16"><a href="#cb42-16" aria-hidden="true" tabindex="-1"></a><span class="co">// - REP STOSB for small sizes</span></span> | |
| <span id="cb42-17"><a href="#cb42-17" aria-hidden="true" tabindex="-1"></a><span class="co">// - SIMD loops for large sizes</span></span> | |
| <span id="cb42-18"><a href="#cb42-18" aria-hidden="true" tabindex="-1"></a><span class="co">// - Non-temporal stores for very large sizes</span></span></code></pre></div> | |
| <h3 id="compiler-perspective-instruction-selection-patterns"><strong>2.7 | |
| Compiler Perspective: Instruction Selection Patterns</strong></h3> | |
| <h4 id="instruction-selection-overview"><strong>Instruction Selection | |
| Overview</strong></h4> | |
| <p>Modern compilers use pattern matching to select optimal | |
| instructions:</p> | |
| <div class="sourceCode" id="cb43"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb43-1"><a href="#cb43-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler's instruction selection process</span></span> | |
| <span id="cb43-2"><a href="#cb43-2" aria-hidden="true" tabindex="-1"></a>class InstructionSelector <span class="op">{</span></span> | |
| <span id="cb43-3"><a href="#cb43-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">void</span> select<span class="op">(</span>IR_Node<span class="op">*</span> node<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb43-4"><a href="#cb43-4" aria-hidden="true" tabindex="-1"></a> <span class="cf">switch</span> <span class="op">(</span>node<span class="op">-></span>type<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb43-5"><a href="#cb43-5" aria-hidden="true" tabindex="-1"></a> <span class="cf">case</span> IR_ADD<span class="op">:</span></span> | |
| <span id="cb43-6"><a href="#cb43-6" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> <span class="op">(</span>is_constant<span class="op">(</span>node<span class="op">-></span>right<span class="op">,</span> <span class="dv">1</span><span class="op">))</span></span> | |
| <span id="cb43-7"><a href="#cb43-7" aria-hidden="true" tabindex="-1"></a> emit_inc<span class="op">(</span>node<span class="op">-></span>left<span class="op">);</span></span> | |
| <span id="cb43-8"><a href="#cb43-8" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span> <span class="cf">if</span> <span class="op">(</span>is_lea_candidate<span class="op">(</span>node<span class="op">))</span></span> | |
| <span id="cb43-9"><a href="#cb43-9" aria-hidden="true" tabindex="-1"></a> emit_lea<span class="op">(</span>node<span class="op">);</span></span> | |
| <span id="cb43-10"><a href="#cb43-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span></span> | |
| <span id="cb43-11"><a href="#cb43-11" aria-hidden="true" tabindex="-1"></a> emit_add<span class="op">(</span>node<span class="op">);</span></span> | |
| <span id="cb43-12"><a href="#cb43-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">break</span><span class="op">;</span></span> | |
| <span id="cb43-13"><a href="#cb43-13" aria-hidden="true" tabindex="-1"></a> </span> | |
| <span id="cb43-14"><a href="#cb43-14" aria-hidden="true" tabindex="-1"></a> <span class="cf">case</span> IR_MULTIPLY<span class="op">:</span></span> | |
| <span id="cb43-15"><a href="#cb43-15" aria-hidden="true" tabindex="-1"></a> <span class="cf">if</span> <span class="op">(</span>is_power_of_two<span class="op">(</span>node<span class="op">-></span>right<span class="op">))</span></span> | |
| <span id="cb43-16"><a href="#cb43-16" aria-hidden="true" tabindex="-1"></a> emit_shift<span class="op">(</span>node<span class="op">);</span></span> | |
| <span id="cb43-17"><a href="#cb43-17" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span> <span class="cf">if</span> <span class="op">(</span>is_lea_multiply<span class="op">(</span>node<span class="op">))</span></span> | |
| <span id="cb43-18"><a href="#cb43-18" aria-hidden="true" tabindex="-1"></a> emit_lea<span class="op">(</span>node<span class="op">);</span></span> | |
| <span id="cb43-19"><a href="#cb43-19" aria-hidden="true" tabindex="-1"></a> <span class="cf">else</span></span> | |
| <span id="cb43-20"><a href="#cb43-20" aria-hidden="true" tabindex="-1"></a> emit_imul<span class="op">(</span>node<span class="op">);</span></span> | |
| <span id="cb43-21"><a href="#cb43-21" aria-hidden="true" tabindex="-1"></a> <span class="cf">break</span><span class="op">;</span></span> | |
| <span id="cb43-22"><a href="#cb43-22" aria-hidden="true" tabindex="-1"></a> <span class="op">}</span></span> | |
| <span id="cb43-23"><a href="#cb43-23" aria-hidden="true" tabindex="-1"></a> <span class="op">}</span></span> | |
| <span id="cb43-24"><a href="#cb43-24" aria-hidden="true" tabindex="-1"></a><span class="op">};</span></span></code></pre></div> | |
| <h4 id="common-optimization-patterns"><strong>Common Optimization | |
| Patterns</strong></h4> | |
| <pre class="assembly"><code>; Strength reduction | |
| ; Multiply by constant → LEA/shift | |
| ; x * 5 becomes: | |
| lea rax, [rdi + rdi*4] | |
| ; x * 100 becomes: | |
| lea rax, [rdi + rdi*4] ; x * 5 | |
| lea rax, [rax + rax*4] ; x * 25 | |
| shl rax, 2 ; x * 100 | |
| ; Division by constant → multiply by reciprocal | |
| ; x / 10 becomes (for unsigned): | |
| mov rax, 0xCCCCCCCCCCCCCCCD ; Reciprocal constant | |
| mul rdi | |
| shr rdx, 3 ; Result in RDX | |
| ; Conditional to branchless | |
| ; x = (a < b) ? c : d becomes: | |
| cmp rdi, rsi | |
| mov rax, r8 ; d | |
| cmovl rax, rdx ; c if less</code></pre> | |
| <h4 id="peephole-optimizations"><strong>Peephole | |
| Optimizations</strong></h4> | |
| <pre class="assembly"><code>; Before optimization: | |
| mov rax, 0 | |
| add rax, rbx | |
| ; After: Eliminate redundant move | |
| mov rax, rbx | |
| ; Before: | |
| cmp rax, 0 | |
| je label | |
| ; After: Use TEST for zero comparison | |
| test rax, rax | |
| je label | |
| ; Before: | |
| mov [rsp+8], rax | |
| mov rbx, [rsp+8] | |
| ; After: Eliminate store-load | |
| mov [rsp+8], rax | |
| mov rbx, rax</code></pre> | |
| <h4 id="code-generation-examples"><strong>Code Generation | |
| Examples</strong></h4> | |
| <div class="sourceCode" id="cb46"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb46-1"><a href="#cb46-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Structure field access</span></span> | |
| <span id="cb46-2"><a href="#cb46-2" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Point <span class="op">{</span> <span class="dt">long</span> x<span class="op">,</span> y<span class="op">,</span> z<span class="op">;</span> <span class="op">};</span></span> | |
| <span id="cb46-3"><a href="#cb46-3" aria-hidden="true" tabindex="-1"></a><span class="dt">long</span> get_y<span class="op">(</span>Point<span class="op">*</span> p<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb46-4"><a href="#cb46-4" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> p<span class="op">-></span>y<span class="op">;</span></span> | |
| <span id="cb46-5"><a href="#cb46-5" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb46-6"><a href="#cb46-6" aria-hidden="true" tabindex="-1"></a><span class="co">// Generates:</span></span> | |
| <span id="cb46-7"><a href="#cb46-7" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rax, [rdi + 8]</span></span> | |
| <span id="cb46-8"><a href="#cb46-8" aria-hidden="true" tabindex="-1"></a><span class="co">// ret</span></span> | |
| <span id="cb46-9"><a href="#cb46-9" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb46-10"><a href="#cb46-10" aria-hidden="true" tabindex="-1"></a><span class="co">// Array indexing</span></span> | |
| <span id="cb46-11"><a href="#cb46-11" aria-hidden="true" tabindex="-1"></a><span class="dt">long</span> array_access<span class="op">(</span><span class="dt">long</span><span class="op">*</span> arr<span class="op">,</span> <span class="dt">long</span> idx<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb46-12"><a href="#cb46-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> arr<span class="op">[</span>idx<span class="op">];</span></span> | |
| <span id="cb46-13"><a href="#cb46-13" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb46-14"><a href="#cb46-14" aria-hidden="true" tabindex="-1"></a><span class="co">// Generates:</span></span> | |
| <span id="cb46-15"><a href="#cb46-15" aria-hidden="true" tabindex="-1"></a><span class="co">// mov rax, [rdi + rsi*8]</span></span> | |
| <span id="cb46-16"><a href="#cb46-16" aria-hidden="true" tabindex="-1"></a><span class="co">// ret</span></span> | |
| <span id="cb46-17"><a href="#cb46-17" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb46-18"><a href="#cb46-18" aria-hidden="true" tabindex="-1"></a><span class="co">// Complex expression</span></span> | |
| <span id="cb46-19"><a href="#cb46-19" aria-hidden="true" tabindex="-1"></a><span class="dt">long</span> expr<span class="op">(</span><span class="dt">long</span> a<span class="op">,</span> <span class="dt">long</span> b<span class="op">,</span> <span class="dt">long</span> c<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb46-20"><a href="#cb46-20" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> <span class="op">(</span>a <span class="op">+</span> b<span class="op">)</span> <span class="op">*</span> c <span class="op">-</span> <span class="op">(</span>a <span class="op"><<</span> <span class="dv">3</span><span class="op">);</span></span> | |
| <span id="cb46-21"><a href="#cb46-21" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb46-22"><a href="#cb46-22" aria-hidden="true" tabindex="-1"></a><span class="co">// Generates:</span></span> | |
| <span id="cb46-23"><a href="#cb46-23" aria-hidden="true" tabindex="-1"></a><span class="co">// lea rax, [rdi + rsi] ; a + b</span></span> | |
| <span id="cb46-24"><a href="#cb46-24" aria-hidden="true" tabindex="-1"></a><span class="co">// imul rax, rdx ; * c</span></span> | |
| <span id="cb46-25"><a href="#cb46-25" aria-hidden="true" tabindex="-1"></a><span class="co">// lea rcx, [rdi*8] ; a << 3</span></span> | |
| <span id="cb46-26"><a href="#cb46-26" aria-hidden="true" tabindex="-1"></a><span class="co">// sub rax, rcx ; final result</span></span> | |
| <span id="cb46-27"><a href="#cb46-27" aria-hidden="true" tabindex="-1"></a><span class="co">// ret</span></span></code></pre></div> | |
| <h4 id="compiler-instruction-costs"><strong>Compiler Instruction | |
| Costs</strong></h4> | |
| <div class="sourceCode" id="cb47"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb47-1"><a href="#cb47-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Simplified cost model for instruction selection</span></span> | |
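| <span id="cb47-2"><a href="#cb47-2" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> InsnCost <span class="op">{</span></span> | |
| <span id="cb47-2a"><a href="#cb47-2a" aria-hidden="true" tabindex="-1"></a> <span class="dt">const</span> <span class="dt">char</span><span class="op">*</span> name<span class="op">;</span> <span class="co">// Instruction pattern</span></span> | |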
| <span id="cb47-3"><a href="#cb47-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> latency<span class="op">;</span> <span class="co">// Cycles to produce result</span></span> | |
| <span id="cb47-4"><a href="#cb47-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> throughput<span class="op">;</span> <span class="co">// Inverse throughput</span></span> | |
| <span id="cb47-5"><a href="#cb47-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> size<span class="op">;</span> <span class="co">// Encoding size</span></span> | |
| <span id="cb47-6"><a href="#cb47-6" aria-hidden="true" tabindex="-1"></a><span class="op">};</span></span> | |
| <span id="cb47-7"><a href="#cb47-7" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb47-8"><a href="#cb47-8" aria-hidden="true" tabindex="-1"></a>InsnCost costs<span class="op">[]</span> <span class="op">=</span> <span class="op">{</span></span> | |
| <span id="cb47-9"><a href="#cb47-9" aria-hidden="true" tabindex="-1"></a> <span class="op">{</span><span class="st">"mov r,r"</span><span class="op">,</span> <span class="dv">0</span><span class="op">,</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">2</span><span class="op">},</span> <span class="co">// Zero latency (move elimination)</span></span> | |
| <span id="cb47-10"><a href="#cb47-10" aria-hidden="true" tabindex="-1"></a> <span class="op">{</span><span class="st">"add r,r"</span><span class="op">,</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">3</span><span class="op">},</span></span> | |
| <span id="cb47-11"><a href="#cb47-11" aria-hidden="true" tabindex="-1"></a> <span class="op">{</span><span class="st">"lea simple"</span><span class="op">,</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">3</span><span class="op">},</span></span> | |
| <span id="cb47-12"><a href="#cb47-12" aria-hidden="true" tabindex="-1"></a> <span class="op">{</span><span class="st">"lea complex"</span><span class="op">,</span> <span class="dv">3</span><span class="op">,</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">4</span><span class="op">},</span> <span class="co">// 3-component LEA</span></span> | |
| <span id="cb47-13"><a href="#cb47-13" aria-hidden="true" tabindex="-1"></a> <span class="op">{</span><span class="st">"imul r,r"</span><span class="op">,</span> <span class="dv">3</span><span class="op">,</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">3</span><span class="op">},</span></span> | |
| <span id="cb47-14"><a href="#cb47-14" aria-hidden="true" tabindex="-1"></a> <span class="op">{</span><span class="st">"div"</span><span class="op">,</span> <span class="dv">20</span><span class="op">,</span> <span class="dv">20</span><span class="op">,</span> <span class="dv">2</span><span class="op">},</span> <span class="co">// Very expensive</span></span> | |
| <span id="cb47-15"><a href="#cb47-15" aria-hidden="true" tabindex="-1"></a><span class="op">};</span></span> | |
| <span id="cb47-16"><a href="#cb47-16" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb47-17"><a href="#cb47-17" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler chooses based on optimization goals:</span></span> | |
| <span id="cb47-18"><a href="#cb47-18" aria-hidden="true" tabindex="-1"></a><span class="co">// -Os: Minimize size</span></span> | |
| <span id="cb47-19"><a href="#cb47-19" aria-hidden="true" tabindex="-1"></a><span class="co">// -O2: Balance latency/throughput</span></span> | |
| <span id="cb47-20"><a href="#cb47-20" aria-hidden="true" tabindex="-1"></a><span class="co">// -O3: Aggressive optimization</span></span></code></pre></div> | |
| <p>This foundation in instruction encoding and core operations provides | |
| the basis for understanding how compilers transform high-level code into | |
| efficient x86-64 machine code. The next chapter will explore memory | |
| architecture and addressing modes in greater detail.</p> | |
| <hr /> | |
| <h2 | |
| id="chapter-3-memory-architecture-and-addressing-modes"><strong>Chapter | |
| 3: Memory Architecture and Addressing Modes</strong></h2> | |
| <h3 id="x86-64-memory-organization"><strong>3.1 x86-64 Memory | |
| Organization</strong></h3> | |
| <h4 id="virtual-address-space-layout"><strong>Virtual Address Space | |
| Layout</strong></h4> | |
| <p>The x86-64 architecture provides a 64-bit virtual address space, | |
| though current implementations use only 48 bits (4-level paging) or 57 bits (5-level paging):</p> | |
| <p>Canonical 48-bit address space:</p> | |
| <pre><code>0x0000000000000000 - 0x00007FFFFFFFFFFF  User space (128 TB) | |
| 0x0000800000000000 - 0xFFFF7FFFFFFFFFFF  Non-canonical (invalid) | |
| 0xFFFF800000000000 - 0xFFFFFFFFFFFFFFFF  Kernel space (128 TB)</code></pre> | |
| <p>With 57-bit addressing (Intel LA57):</p> | |
| <pre><code>0x0000000000000000 - 0x00FFFFFFFFFFFFFF  User space (64 PB) | |
| 0x0100000000000000 - 0xFEFFFFFFFFFFFFFF  Non-canonical | |
| 0xFF00000000000000 - 0xFFFFFFFFFFFFFFFF  Kernel space (64 PB)</code></pre> | |
| <h4 id="memory-segmentation-in-64-bit-mode-1"><strong>Memory | |
| Segmentation in 64-bit Mode</strong></h4> | |
| <p>While segmentation is largely disabled in 64-bit mode, some aspects | |
| remain:</p> | |
| <pre class="assembly"><code>; Segment registers in 64-bit mode | |
| ; CS, DS, ES, SS - Base forced to 0, limits ignored | |
| ; FS, GS - Base addresses can be set via MSRs | |
| ; Thread-local storage using FS/GS | |
| mov rax, fs:[0] ; Read thread-local variable | |
| mov rax, gs:[0x10] ; Access per-CPU data (kernel) | |
| ; Setting FS/GS base | |
| mov ecx, 0xC0000100 ; FS_BASE MSR | |
| mov eax, edi ; Low 32 bits | |
| mov edx, esi ; High 32 bits | |
| wrmsr</code></pre> | |
| <h4 id="page-table-structure"><strong>Page Table Structure</strong></h4> | |
| <p>Modern x86-64 uses 4 or 5-level paging:</p> | |
| <div class="sourceCode" id="cb49"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb49-1"><a href="#cb49-1" aria-hidden="true" tabindex="-1"></a><span class="co">// 4-level paging structure (48-bit addresses)</span></span> | |
| <span id="cb49-2"><a href="#cb49-2" aria-hidden="true" tabindex="-1"></a><span class="kw">typedef</span> <span class="kw">struct</span> <span class="op">{</span></span> | |
| <span id="cb49-3"><a href="#cb49-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint64_t</span> entries<span class="op">[</span><span class="dv">512</span><span class="op">];</span></span> | |
| <span id="cb49-4"><a href="#cb49-4" aria-hidden="true" tabindex="-1"></a><span class="op">}</span> PageTable<span class="op">;</span></span> | |
| <span id="cb49-5"><a href="#cb49-5" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb49-6"><a href="#cb49-6" aria-hidden="true" tabindex="-1"></a><span class="co">// Virtual address breakdown (4-level):</span></span> | |
| <span id="cb49-7"><a href="#cb49-7" aria-hidden="true" tabindex="-1"></a><span class="co">// Bits 47:39 - PML4 index (9 bits)</span></span> | |
| <span id="cb49-8"><a href="#cb49-8" aria-hidden="true" tabindex="-1"></a><span class="co">// Bits 38:30 - PDPT index (9 bits)</span></span> | |
| <span id="cb49-9"><a href="#cb49-9" aria-hidden="true" tabindex="-1"></a><span class="co">// Bits 29:21 - PD index (9 bits)</span></span> | |
| <span id="cb49-10"><a href="#cb49-10" aria-hidden="true" tabindex="-1"></a><span class="co">// Bits 20:12 - PT index (9 bits)</span></span> | |
| <span id="cb49-11"><a href="#cb49-11" aria-hidden="true" tabindex="-1"></a><span class="co">// Bits 11:0 - Page offset (12 bits)</span></span> | |
| <span id="cb49-12"><a href="#cb49-12" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb49-13"><a href="#cb49-13" aria-hidden="true" tabindex="-1"></a><span class="co">// Page table entry format</span></span> | |
| <span id="cb49-14"><a href="#cb49-14" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_PRESENT </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">0</span><span class="op">)</span></span> | |
| <span id="cb49-15"><a href="#cb49-15" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_WRITABLE </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">1</span><span class="op">)</span></span> | |
| <span id="cb49-16"><a href="#cb49-16" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_USER </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">2</span><span class="op">)</span></span> | |
| <span id="cb49-17"><a href="#cb49-17" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_PWT </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">3</span><span class="op">)</span></span> | |
| <span id="cb49-18"><a href="#cb49-18" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_PCD </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">4</span><span class="op">)</span></span> | |
| <span id="cb49-19"><a href="#cb49-19" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_ACCESSED </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">5</span><span class="op">)</span></span> | |
| <span id="cb49-20"><a href="#cb49-20" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_DIRTY </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">6</span><span class="op">)</span></span> | |
| <span id="cb49-21"><a href="#cb49-21" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_HUGE </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">7</span><span class="op">)</span><span class="pp"> </span><span class="co">// PS bit</span></span> | |
| <span id="cb49-22"><a href="#cb49-22" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_GLOBAL </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">8</span><span class="op">)</span></span> | |
| <span id="cb49-23"><a href="#cb49-23" aria-hidden="true" tabindex="-1"></a><span class="pp">#define PTE_NX </span><span class="op">(</span><span class="dv">1</span><span class="bu">ULL</span><span class="pp"> </span><span class="op"><<</span><span class="pp"> </span><span class="dv">63</span><span class="op">)</span></span></code></pre></div> | |
| <h4 id="memory-types-and-caching"><strong>Memory Types and | |
| Caching</strong></h4> | |
| <pre class="assembly"><code>; Memory types (set via PAT/MTRR) | |
| ; UC - Uncacheable | |
| ; WC - Write Combining | |
| ; WT - Write Through | |
| ; WP - Write Protected | |
| ; WB - Write Back (normal cacheable) | |
| ; Cache control instructions | |
| clflush [rax] ; Flush cache line | |
| clflushopt [rax] ; Optimized flush | |
| clwb [rax] ; Write back without invalidate | |
| ; Memory fences | |
| mfence ; Full memory fence | |
| sfence ; Store fence | |
| lfence ; Load fence</code></pre> | |
| <h3 id="complex-addressing-modes"><strong>3.2 Complex Addressing | |
| Modes</strong></h3> | |
| <h4 id="general-addressing-mode-format"><strong>General Addressing Mode | |
| Format</strong></h4> | |
| <p>x86-64 supports a flexible addressing mode of the form | |
| <code>[base + index*scale + displacement]</code>.</p> | |
| <p>Where:</p> | |
| <ul> | |
| <li><p>base: any general-purpose register</p></li> | |
| <li><p>index: any GPR except RSP</p></li> | |
| <li><p>scale: 1, 2, 4, or 8</p></li> | |
| <li><p>displacement: 0, 8-bit, or 32-bit signed</p></li> | |
| </ul> | |
| <h4 id="addressing-mode-examples"><strong>Addressing Mode | |
| Examples</strong></h4> | |
| <pre class="assembly"><code>; Direct addressing | |
| mov rax, [0x1000] ; Absolute address (rare in 64-bit) | |
| mov rax, [label] ; RIP-relative (preferred) | |
| ; Register indirect | |
| mov rax, [rbx] ; [base] | |
| ; Displacement | |
| mov rax, [rbx + 8] ; [base + disp8] | |
| mov rax, [rbx + 1000] ; [base + disp32] | |
| ; Scaled index | |
| mov rax, [rbx + rcx*8] ; [base + index*scale] | |
| ; Full addressing mode | |
| mov rax, [rbx + rcx*8 + 16] ; [base + index*scale + disp] | |
| ; Special cases | |
| mov rax, [rcx*2 + 100] ; [index*scale + disp] - no base | |
| mov rax, [rsp + 8] ; RSP requires SIB byte</code></pre> | |
| <h4 id="rip-relative-addressing"><strong>RIP-Relative | |
| Addressing</strong></h4> | |
| <p>RIP-relative addressing is crucial for position-independent code:</p> | |
| <pre class="assembly"><code>; RIP-relative data access | |
| data: dq 0x123456789ABCDEF0 | |
| func: | |
| mov rax, [rip + data] ; Load from data | |
| lea rbx, [rip + data] ; Get address of data | |
| ; Compiler-generated RIP-relative | |
| ; C code: extern int global_var; | |
| ; int x = global_var; | |
| ; Generates: | |
| mov rax, [rip + global_var@GOTPCREL] ; Via GOT: load the variable's address | |
| mov eax, [rax] ; ...then load its value | |
| ; or | |
| mov eax, [rip + global_var] ; Direct</code></pre> | |
| <h4 id="addressing-mode-encoding"><strong>Addressing Mode | |
| Encoding</strong></h4> | |
| <pre class="assembly"><code>; ModR/M byte: [mod][reg][r/m] | |
| ; mod: 00 = no disp, 01 = disp8, 10 = disp32, 11 = register | |
| ; reg: register operand or opcode extension | |
| ; r/m: register or memory operand | |
| ; SIB byte: [scale][index][base] | |
| ; Required when: | |
| ; - Using RSP as base | |
| ; - Using scaled index | |
| ; - Using absolute [disp32] addressing (no base or index; plain r/m=101 means RIP-relative) | |
| ; Examples with encoding details: | |
| mov rax, [rbx] ; ModR/M: 03 (no SIB needed) | |
| mov rax, [rsp] ; ModR/M: 04, SIB: 24 | |
| mov rax, [rbx + rcx*8] ; ModR/M: 04, SIB: CB</code></pre> | |
| <h3 id="memory-access-patterns-and-optimization"><strong>3.3 Memory | |
| Access Patterns and Optimization</strong></h3> | |
| <h4 id="cache-friendly-access-patterns"><strong>Cache-Friendly Access | |
| Patterns</strong></h4> | |
| <pre class="assembly"><code>; Sequential access (prefetcher-friendly) | |
| process_array: | |
| xor rdx, rdx ; Sum | |
| .loop: | |
| add rdx, [rdi] ; Sequential read | |
| add rdi, 8 | |
| dec rsi | |
| jnz .loop | |
| ; Strided access (less efficient) | |
| process_strided: | |
| xor rdx, rdx | |
| .loop: | |
| add rdx, [rdi] | |
| add rdi, 64 ; 64-byte stride: one cache line per element | |
| dec rsi | |
| jnz .loop</code></pre> | |
| <h4 id="prefetching"><strong>Prefetching</strong></h4> | |
| <pre class="assembly"><code>; Software prefetch instructions | |
| prefetchnta [rax] ; Non-temporal (bypass cache) | |
| prefetcht0 [rax] ; To L1 cache | |
| prefetcht1 [rax] ; To L2 cache | |
| prefetcht2 [rax] ; To L3 cache | |
| prefetchw [rax] ; For write | |
| ; Compiler prefetch pattern | |
| process_with_prefetch: | |
| mov rcx, rsi | |
| sub rcx, 8 ; Prefetch 8 iterations ahead | |
| .loop: | |
| prefetcht0 [rdi + 64] ; Prefetch next cache line | |
| ; Process current data | |
| movaps xmm0, [rdi] | |
| movaps xmm1, [rdi + 16] | |
| ; ... processing ... | |
| add rdi, 64 | |
| dec rsi | |
| jnz .loop</code></pre> | |
| <h4 id="non-temporal-memory-access"><strong>Non-Temporal Memory | |
| Access</strong></h4> | |
| <pre class="assembly"><code>; Non-temporal stores (bypass cache) | |
| movnti [rax], rbx ; NT store integer | |
| movntdq [rax], xmm0 ; NT store 128-bit | |
| movntpd [rax], xmm0 ; NT store packed double | |
| ; Non-temporal loads (SSE4.1) | |
| movntdqa xmm0, [rax] ; NT load 128-bit | |
| ; Example: Large memory copy bypassing cache | |
| large_memcpy: | |
| .loop: | |
| movdqa xmm0, [rsi] | |
| movdqa xmm1, [rsi + 16] | |
| movdqa xmm2, [rsi + 32] | |
| movdqa xmm3, [rsi + 48] | |
| movntdq [rdi], xmm0 | |
| movntdq [rdi + 16], xmm1 | |
| movntdq [rdi + 32], xmm2 | |
| movntdq [rdi + 48], xmm3 | |
| add rsi, 64 | |
| add rdi, 64 | |
| sub rdx, 64 | |
| jnz .loop | |
| sfence ; Ensure completion | |
| ret</code></pre> | |
| <h3 id="stack-operations-and-management"><strong>3.4 Stack Operations | |
| and Management</strong></h3> | |
| <h4 id="stack-frame-layout"><strong>Stack Frame Layout</strong></h4> | |
| <pre class="assembly"><code>; Typical stack frame structure | |
| ; Higher addresses | |
| ; ... | |
| ; [rbp + 24] - Argument 8 (if passed on stack) | |
| ; [rbp + 16] - Argument 7 (if passed on stack) | |
| ; [rbp + 8] - Return address | |
| ; [rbp + 0] - Saved RBP (frame pointer) | |
| ; [rbp - 8] - Local variable 1 | |
| ; [rbp - 16] - Local variable 2 | |
| ; [rsp] - Top of stack | |
| ; Lower addresses | |
| ; Function prologue | |
| function: | |
| push rbp ; Save frame pointer | |
| mov rbp, rsp ; Establish frame | |
| sub rsp, 32 ; Allocate locals | |
| ; Function epilogue | |
| mov rsp, rbp ; Restore stack | |
| pop rbp ; Restore frame pointer | |
| ret</code></pre> | |
| <h4 id="stack-alignment"><strong>Stack Alignment</strong></h4> | |
| <pre class="assembly"><code>; System V AMD64 ABI requires 16-byte alignment before CALL | |
| align_stack: | |
| test rsp, 15 ; Check alignment | |
| jz .aligned | |
| sub rsp, 8 ; Align if needed | |
| .aligned: | |
| call function | |
| ; Compiler ensures alignment | |
| ; Immediately before CALL: RSP mod 16 = 0 | |
| ; CALL pushes 8-byte return address | |
| ; At function entry: RSP mod 16 = 8 (0 again after push rbp) | |
| <h4 id="red-zone"><strong>Red Zone</strong></h4> | |
| <pre class="assembly"><code>; 128-byte red zone below RSP (System V AMD64) | |
| ; Can be used without adjusting RSP | |
| leaf_function: | |
| mov [rsp - 8], rdi ; Use red zone | |
| mov [rsp - 16], rsi | |
| ; ... computation ... | |
| mov rax, [rsp - 8] | |
| ret | |
| ; Signal handlers and kernel must respect red zone | |
| ; Windows x64 has no red zone!</code></pre> | |
| <h3 id="memory-barriers-and-atomics"><strong>3.5 Memory Barriers and | |
| Atomics</strong></h3> | |
| <h4 id="memory-ordering"><strong>Memory Ordering</strong></h4> | |
| <pre class="assembly"><code>; x86-64 memory model (Total Store Order) | |
| ; Guarantees: | |
| ; - Loads are not reordered with loads | |
| ; - Stores are not reordered with stores | |
| ; - Stores are not reordered with older loads | |
| ; - Loads may be reordered with older stores | |
| ; Memory barriers | |
| mfence ; Full barrier | |
| sfence ; Store barrier | |
| lfence ; Load barrier + speculation barrier</code></pre> | |
| <h4 id="atomic-operations"><strong>Atomic Operations</strong></h4> | |
| <pre class="assembly"><code>; LOCK prefix for atomicity | |
| lock add [rax], rbx ; Atomic add | |
| lock xchg [rax], rbx ; XCHG is implicitly locked | |
| lock cmpxchg [rax], rbx ; Compare and exchange | |
| ; Lock-free patterns | |
| atomic_increment: | |
| mov rax, 1 | |
| lock xadd [rdi], rax ; Fetch-and-add | |
| inc rax ; Return old + 1 | |
| ret | |
| ; Compare-and-swap loop | |
| cas_loop: | |
| mov rax, [rdi] ; Load current value | |
| .retry: | |
| mov rdx, rax | |
| add rdx, 1 ; Compute new value | |
| lock cmpxchg [rdi], rdx ; Try to update | |
| jnz .retry ; Retry if changed | |
| ret</code></pre> | |
| <h4 id="transactional-memory-tsx"><strong>Transactional Memory | |
| (TSX)</strong></h4> | |
| <pre class="assembly"><code>; Hardware Lock Elision (HLE) | |
| xacquire lock add [rax], rbx ; Begin transaction | |
| xrelease lock sub [rax], rbx ; End transaction | |
| ; Restricted Transactional Memory (RTM) | |
| transaction: | |
| xbegin .abort ; Start transaction | |
| ; ... transactional code ... | |
| mov rax, [shared_data] | |
| add rax, 1 | |
| mov [shared_data], rax | |
| xend ; Commit transaction | |
| jmp .done | |
| .abort: | |
| ; Handle abort (check EAX for reason) | |
| test eax, 1 ; Bit 0 set: explicit XABORT (imm8 code in EAX[31:24]) | |
| jnz .fallback | |
| ; Retry logic... | |
| .fallback: | |
| ; Non-transactional path | |
| .done:</code></pre> | |
| <h3 id="effective-address-calculation-lea"><strong>3.6 Effective Address | |
| Calculation (LEA)</strong></h3> | |
| <h4 id="lea-instruction-capabilities"><strong>LEA Instruction | |
| Capabilities</strong></h4> | |
| <pre class="assembly"><code>; LEA performs address calculation without memory access | |
| ; Useful for arithmetic and address computation | |
| ; Simple arithmetic | |
| lea rax, [rbx + 5] ; rax = rbx + 5 | |
| lea rax, [rbx + rcx] ; rax = rbx + rcx | |
| ; Scaled arithmetic | |
| lea rax, [rbx*2] ; rax = rbx * 2 | |
| lea rax, [rbx + rbx*2] ; rax = rbx * 3 | |
| lea rax, [rbx + rbx*4] ; rax = rbx * 5 | |
| lea rax, [rbx + rbx*8] ; rax = rbx * 9 | |
| ; Complex calculations | |
| lea rax, [rbx + rcx*4 + 10] ; rax = rbx + rcx*4 + 10 | |
| ; Three-operand arithmetic | |
| lea rax, [rdi + rsi] ; rax = rdi + rsi (preserves both)</code></pre> | |
| <h4 id="compiler-lea-patterns"><strong>Compiler LEA | |
| Patterns</strong></h4> | |
| <div class="sourceCode" id="cb64"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb64-1"><a href="#cb64-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Array indexing</span></span> | |
| <span id="cb64-2"><a href="#cb64-2" aria-hidden="true" tabindex="-1"></a><span class="dt">int</span><span class="op">*</span> array_element<span class="op">(</span><span class="dt">int</span><span class="op">*</span> base<span class="op">,</span> <span class="dt">long</span> i<span class="op">,</span> <span class="dt">long</span> j<span class="op">,</span> <span class="dt">long</span> stride<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb64-3"><a href="#cb64-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> <span class="op">&</span>base<span class="op">[</span>i <span class="op">*</span> stride <span class="op">+</span> j<span class="op">];</span></span> | |
| <span id="cb64-4"><a href="#cb64-4" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb64-5"><a href="#cb64-5" aria-hidden="true" tabindex="-1"></a><span class="co">// Generates:</span></span> | |
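| <span id="cb64-6"><a href="#cb64-6" aria-hidden="true" tabindex="-1"></a><span class="co">// imul rsi, rcx ; i * stride</span></span> | |
| <span id="cb64-6b"><a href="#cb64-6b" aria-hidden="true" tabindex="-1"></a><span class="co">// add rsi, rdx ; + j</span></span> | |
| <span id="cb64-7"><a href="#cb64-7" aria-hidden="true" tabindex="-1"></a><span class="co">// lea rax, [rdi + rsi*4] ; base + index * sizeof(int)</span></span> | |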
| <span id="cb64-8"><a href="#cb64-8" aria-hidden="true" tabindex="-1"></a><span class="co">// ret</span></span> | |
| <span id="cb64-9"><a href="#cb64-9" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb64-10"><a href="#cb64-10" aria-hidden="true" tabindex="-1"></a><span class="co">// Structure offset calculation</span></span> | |
| <span id="cb64-11"><a href="#cb64-11" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Large <span class="op">{</span> <span class="dt">char</span> data<span class="op">[</span><span class="dv">1024</span><span class="op">];</span> <span class="op">};</span></span> | |
| <span id="cb64-12"><a href="#cb64-12" aria-hidden="true" tabindex="-1"></a>Large<span class="op">*</span> next_element<span class="op">(</span>Large<span class="op">*</span> ptr<span class="op">,</span> <span class="dt">long</span> offset<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb64-13"><a href="#cb64-13" aria-hidden="true" tabindex="-1"></a> <span class="cf">return</span> ptr <span class="op">+</span> offset<span class="op">;</span></span> | |
| <span id="cb64-14"><a href="#cb64-14" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb64-15"><a href="#cb64-15" aria-hidden="true" tabindex="-1"></a><span class="co">// Generates:</span></span> | |
| <span id="cb64-16"><a href="#cb64-16" aria-hidden="true" tabindex="-1"></a><span class="co">// shl rsi, 10 ; offset * 1024</span></span> | |
| <span id="cb64-17"><a href="#cb64-17" aria-hidden="true" tabindex="-1"></a><span class="co">// lea rax, [rdi + rsi]</span></span> | |
| <span id="cb64-18"><a href="#cb64-18" aria-hidden="true" tabindex="-1"></a><span class="co">// ret</span></span></code></pre></div> | |
| <h4 id="lea-vs-other-instructions"><strong>LEA vs Other | |
| Instructions</strong></h4> | |
| <pre class="assembly"><code>; LEA advantages: | |
| ; - No flags modification | |
| ; - Three-operand form | |
| ; - Single-cycle execution (simple forms) | |
| ; Comparison: x = y * 5 | |
| ; Using IMUL: | |
| mov rax, rdi | |
| imul rax, 5 ; 3-cycle latency | |
| ; Using LEA: | |
| lea rax, [rdi + rdi*4] ; 1-cycle latency | |
| ; Complex LEA can be slower | |
| lea rax, [rbx + rcx*8 + 1000] ; 3-cycle latency on some CPUs</code></pre> | |
| <h3 id="compiler-memory-optimization-strategies"><strong>3.7 Compiler | |
| Memory Optimization Strategies</strong></h3> | |
| <h4 id="structure-layout-and-padding"><strong>Structure Layout and | |
| Padding</strong></h4> | |
| <div class="sourceCode" id="cb66"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb66-1"><a href="#cb66-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler structure padding</span></span> | |
| <span id="cb66-2"><a href="#cb66-2" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Unoptimized <span class="op">{</span></span> | |
| <span id="cb66-3"><a href="#cb66-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">char</span> a<span class="op">;</span> <span class="co">// Offset 0</span></span> | |
| <span id="cb66-4"><a href="#cb66-4" aria-hidden="true" tabindex="-1"></a> <span class="co">// 7 bytes padding</span></span> | |
| <span id="cb66-5"><a href="#cb66-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">double</span> b<span class="op">;</span> <span class="co">// Offset 8</span></span> | |
| <span id="cb66-6"><a href="#cb66-6" aria-hidden="true" tabindex="-1"></a> <span class="dt">char</span> c<span class="op">;</span> <span class="co">// Offset 16</span></span> | |
| <span id="cb66-7"><a href="#cb66-7" aria-hidden="true" tabindex="-1"></a> <span class="co">// 3 bytes padding</span></span> | |
| <span id="cb66-8"><a href="#cb66-8" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> d<span class="op">;</span> <span class="co">// Offset 20</span></span> | |
| <span id="cb66-9"><a href="#cb66-9" aria-hidden="true" tabindex="-1"></a><span class="op">};</span> <span class="co">// Size: 24 bytes</span></span> | |
| <span id="cb66-10"><a href="#cb66-10" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb66-11"><a href="#cb66-11" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Optimized <span class="op">{</span></span> | |
| <span id="cb66-12"><a href="#cb66-12" aria-hidden="true" tabindex="-1"></a> <span class="dt">double</span> b<span class="op">;</span> <span class="co">// Offset 0</span></span> | |
| <span id="cb66-13"><a href="#cb66-13" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> d<span class="op">;</span> <span class="co">// Offset 8</span></span> | |
| <span id="cb66-14"><a href="#cb66-14" aria-hidden="true" tabindex="-1"></a> <span class="dt">char</span> a<span class="op">;</span> <span class="co">// Offset 12</span></span> | |
| <span id="cb66-15"><a href="#cb66-15" aria-hidden="true" tabindex="-1"></a> <span class="dt">char</span> c<span class="op">;</span> <span class="co">// Offset 13</span></span> | |
| <span id="cb66-16"><a href="#cb66-16" aria-hidden="true" tabindex="-1"></a> <span class="co">// 2 bytes padding</span></span> | |
| <span id="cb66-17"><a href="#cb66-17" aria-hidden="true" tabindex="-1"></a><span class="op">};</span> <span class="co">// Size: 16 bytes</span></span> | |
| <span id="cb66-18"><a href="#cb66-18" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb66-19"><a href="#cb66-19" aria-hidden="true" tabindex="-1"></a><span class="co">// Assembly access patterns</span></span> | |
| <span id="cb66-20"><a href="#cb66-20" aria-hidden="true" tabindex="-1"></a><span class="co">// Unoptimized:</span></span> | |
| <span id="cb66-21"><a href="#cb66-21" aria-hidden="true" tabindex="-1"></a><span class="co">// movzx eax, byte [rdi] ; a</span></span> | |
| <span id="cb66-22"><a href="#cb66-22" aria-hidden="true" tabindex="-1"></a><span class="co">// movsd xmm0, [rdi + 8] ; b</span></span> | |
| <span id="cb66-23"><a href="#cb66-23" aria-hidden="true" tabindex="-1"></a><span class="co">// movzx ecx, byte [rdi + 16] ; c</span></span> | |
| <span id="cb66-24"><a href="#cb66-24" aria-hidden="true" tabindex="-1"></a><span class="co">// mov edx, [rdi + 20] ; d</span></span> | |
| <span id="cb66-25"><a href="#cb66-25" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb66-26"><a href="#cb66-26" aria-hidden="true" tabindex="-1"></a><span class="co">// Optimized:</span></span> | |
| <span id="cb66-27"><a href="#cb66-27" aria-hidden="true" tabindex="-1"></a><span class="co">// movsd xmm0, [rdi] ; b</span></span> | |
| <span id="cb66-28"><a href="#cb66-28" aria-hidden="true" tabindex="-1"></a><span class="co">// mov edx, [rdi + 8] ; d</span></span> | |
| <span id="cb66-29"><a href="#cb66-29" aria-hidden="true" tabindex="-1"></a><span class="co">// movzx eax, byte [rdi + 12] ; a</span></span> | |
| <span id="cb66-30"><a href="#cb66-30" aria-hidden="true" tabindex="-1"></a><span class="co">// movzx ecx, byte [rdi + 13] ; c</span></span></code></pre></div> | |
| <h4 id="loop-optimization-and-memory-access"><strong>Loop Optimization | |
| and Memory Access</strong></h4> | |
| <pre class="assembly"><code>; Original loop | |
| .loop1: | |
| mov rax, [rdi] | |
| add rax, [rsi] | |
| mov [rdx], rax | |
| add rdi, 8 | |
| add rsi, 8 | |
| add rdx, 8 | |
| dec rcx | |
| jnz .loop1 | |
| ; Unrolled and optimized | |
| .loop2: | |
| ; Prefetch next iteration | |
| prefetcht0 [rdi + 64] | |
| prefetcht0 [rsi + 64] | |
| ; Process 4 elements at once | |
| mov rax, [rdi] | |
| mov rbx, [rdi + 8] | |
| mov r8, [rdi + 16] | |
| mov r9, [rdi + 24] | |
| add rax, [rsi] | |
| add rbx, [rsi + 8] | |
| add r8, [rsi + 16] | |
| add r9, [rsi + 24] | |
| mov [rdx], rax | |
| mov [rdx + 8], rbx | |
| mov [rdx + 16], r8 | |
| mov [rdx + 24], r9 | |
| add rdi, 32 | |
| add rsi, 32 | |
| add rdx, 32 | |
| sub rcx, 4 | |
| jnz .loop2</code></pre> | |
| <h4 id="alias-analysis-and-optimization"><strong>Alias Analysis and | |
| Optimization</strong></h4> | |
| <div class="sourceCode" id="cb68"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb68-1"><a href="#cb68-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Compiler must assume pointers may alias</span></span> | |
| <span id="cb68-2"><a href="#cb68-2" aria-hidden="true" tabindex="-1"></a><span class="dt">void</span> may_alias<span class="op">(</span><span class="dt">int</span><span class="op">*</span> a<span class="op">,</span> <span class="dt">int</span><span class="op">*</span> b<span class="op">,</span> <span class="dt">int</span><span class="op">*</span> c<span class="op">,</span> <span class="dt">int</span> n<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb68-3"><a href="#cb68-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> <span class="op">(</span><span class="dt">int</span> i <span class="op">=</span> <span class="dv">0</span><span class="op">;</span> i <span class="op"><</span> n<span class="op">;</span> i<span class="op">++)</span> <span class="op">{</span></span> | |
| <span id="cb68-4"><a href="#cb68-4" aria-hidden="true" tabindex="-1"></a> a<span class="op">[</span>i<span class="op">]</span> <span class="op">=</span> b<span class="op">[</span>i<span class="op">]</span> <span class="op">+</span> c<span class="op">[</span>i<span class="op">];</span> <span class="co">// Must reload c[i] each time</span></span> | |
| <span id="cb68-5"><a href="#cb68-5" aria-hidden="true" tabindex="-1"></a> <span class="op">}</span></span> | |
| <span id="cb68-6"><a href="#cb68-6" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb68-7"><a href="#cb68-7" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb68-8"><a href="#cb68-8" aria-hidden="true" tabindex="-1"></a><span class="co">// With restrict keyword</span></span> | |
| <span id="cb68-9"><a href="#cb68-9" aria-hidden="true" tabindex="-1"></a><span class="dt">void</span> no_alias<span class="op">(</span><span class="dt">int</span><span class="op">*</span> <span class="dt">restrict</span> a<span class="op">,</span> <span class="dt">int</span><span class="op">*</span> <span class="dt">restrict</span> b<span class="op">,</span> </span> | |
| <span id="cb68-10"><a href="#cb68-10" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span><span class="op">*</span> <span class="dt">restrict</span> c<span class="op">,</span> <span class="dt">int</span> n<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb68-11"><a href="#cb68-11" aria-hidden="true" tabindex="-1"></a> <span class="co">// Compiler can optimize more aggressively</span></span> | |
| <span id="cb68-12"><a href="#cb68-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">for</span> <span class="op">(</span><span class="dt">int</span> i <span class="op">=</span> <span class="dv">0</span><span class="op">;</span> i <span class="op"><</span> n<span class="op">;</span> i<span class="op">++)</span> <span class="op">{</span></span> | |
| <span id="cb68-13"><a href="#cb68-13" aria-hidden="true" tabindex="-1"></a> a<span class="op">[</span>i<span class="op">]</span> <span class="op">=</span> b<span class="op">[</span>i<span class="op">]</span> <span class="op">+</span> c<span class="op">[</span>i<span class="op">];</span></span> | |
| <span id="cb68-14"><a href="#cb68-14" aria-hidden="true" tabindex="-1"></a> <span class="op">}</span></span> | |
| <span id="cb68-15"><a href="#cb68-15" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span> | |
| <span id="cb68-16"><a href="#cb68-16" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb68-17"><a href="#cb68-17" aria-hidden="true" tabindex="-1"></a><span class="co">// Assembly difference:</span></span> | |
| <span id="cb68-18"><a href="#cb68-18" aria-hidden="true" tabindex="-1"></a><span class="co">// may_alias inner loop:</span></span> | |
| <span id="cb68-19"><a href="#cb68-19" aria-hidden="true" tabindex="-1"></a><span class="co">// mov eax, [rsi + rcx*4]</span></span> | |
| <span id="cb68-20"><a href="#cb68-20" aria-hidden="true" tabindex="-1"></a><span class="co">// add eax, [rdx + rcx*4] ; Must reload</span></span> | |
| <span id="cb68-21"><a href="#cb68-21" aria-hidden="true" tabindex="-1"></a><span class="co">// mov [rdi + rcx*4], eax</span></span> | |
| <span id="cb68-22"><a href="#cb68-22" aria-hidden="true" tabindex="-1"></a><span class="co">// inc rcx</span></span> | |
| <span id="cb68-23"><a href="#cb68-23" aria-hidden="true" tabindex="-1"></a><span class="co">// cmp rcx, r8</span></span> | |
| <span id="cb68-24"><a href="#cb68-24" aria-hidden="true" tabindex="-1"></a><span class="co">// jl .loop</span></span> | |
| <span id="cb68-25"><a href="#cb68-25" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb68-26"><a href="#cb68-26" aria-hidden="true" tabindex="-1"></a><span class="co">// no_alias can use vector instructions:</span></span> | |
| <span id="cb68-27"><a href="#cb68-27" aria-hidden="true" tabindex="-1"></a><span class="co">// movdqu xmm0, [rsi + rcx*4]</span></span> | |
| <span id="cb68-28"><a href="#cb68-28" aria-hidden="true" tabindex="-1"></a><span class="co">// paddd xmm0, [rdx + rcx*4]</span></span> | |
| <span id="cb68-29"><a href="#cb68-29" aria-hidden="true" tabindex="-1"></a><span class="co">// movdqu [rdi + rcx*4], xmm0</span></span> | |
| <span id="cb68-30"><a href="#cb68-30" aria-hidden="true" tabindex="-1"></a><span class="co">// add rcx, 4</span></span> | |
| <span id="cb68-31"><a href="#cb68-31" aria-hidden="true" tabindex="-1"></a><span class="co">// cmp rcx, r8</span></span> | |
| <span id="cb68-32"><a href="#cb68-32" aria-hidden="true" tabindex="-1"></a><span class="co">// jl .loop</span></span></code></pre></div> | |
| <h4 id="memory-access-coalescing"><strong>Memory Access | |
| Coalescing</strong></h4> | |
| <pre class="assembly"><code>; Inefficient: Multiple small accesses | |
| load_bytes: | |
| movzx eax, byte [rdi] | |
| movzx ecx, byte [rdi + 1] | |
| movzx edx, byte [rdi + 2] | |
| movzx esi, byte [rdi + 3] | |
| ; Efficient: Single coalesced access | |
| load_dword: | |
| mov eax, [rdi] ; Load all 4 bytes | |
| movzx ecx, al ; Extract byte 0 | |
| movzx edx, ah ; Extract byte 1 | |
| shr eax, 16</code></pre> | |
| <hr /> | |
| <h4 id="summary-and-key-takeaways"><strong>Summary and Key | |
| Takeaways</strong></h4> | |
| <p>In moving from the “big picture” of x86‑64’s virtual address space | |
| down through the mechanics of base/index/scale encoding, this chapter | |
| shows that memory architecture is one of the richest areas where the | |
| assembler’s low‑level control and the compiler’s high‑level choices | |
| meet.</p> | |
| <ul> | |
| <li><strong>Addressing</strong> — The | |
| <code>[base + index*scale + displacement]</code> model, with | |
| RIP‑relative addressing in long mode, is central to both hand‑written | |
| position‑independent code and compiler‑generated relocatable | |
| binaries.</li> | |
| <li><strong>Segmentation and paging</strong> — While segmentation is | |
| largely gone in 64‑bit mode, the FS/GS bases still provide a powerful | |
| indirection point for per‑thread/per‑CPU data, and 4‑/5‑level page | |
| tables govern how every virtual address is translated in systems | |
| programming.</li> | |
| <li><strong>Caching and access patterns</strong> — The architecture’s | |
| total store order model and rich cache‑control instructions mean that | |
| both inline assembly loops and compiler auto‑vectorized code can be | |
| strongly influenced by how data is laid out and traversed.</li> | |
| <li><strong>Stack discipline</strong> — ABI‑mandated alignment, red‑zone | |
| usage, and prologue/epilogue conventions are the groundwork on which | |
| safe interoperability with C/C++ runtimes depends.</li> | |
| <li><strong>Atomicity and ordering</strong> — LOCK‑prefixed | |
| instructions, fences, and transactional execution influence everything | |
| from spinlocks to lock‑free data structures.</li> | |
| <li><strong>LEA as a computational tool</strong> — Despite its name, | |
| LEA does more than “load effective address”: it’s a flexible | |
| three‑operand, flag‑neutral arithmetic builder that compilers lean on | |
| heavily.</li> | |
| <li><strong>Compiler optimisation levers</strong> — Structure padding, | |
| alias analysis, unrolling, and access coalescing are examples of how | |
| high‑level awareness directly affects instruction choices and addressing | |
| modes.</li> | |
| </ul> | |
| <p>From an <em>assembly programmer’s</em> perspective, this means | |
| knowing when to pick a particular addressing mode or cache hint and how | |
| to encode it. From a <em>compiler engineer’s</em> perspective, it means | |
| recognising the patterns in the front‑end’s IR that can be lowered to | |
| the most efficient form — or, conversely, where a less‑fancy form will | |
| avoid microarchitectural pitfalls.</p> | |
| <hr /> | |
| <h4 id="looking-ahead"><strong>Looking Ahead</strong></h4> | |
| <p>The interplay between memory and computation becomes even tighter | |
| when we start adding <strong>vector and floating‑point | |
| extensions</strong> to the mix. After the next chapter’s closer look at | |
| the stack and calling conventions, a later chapter will shift into:</p> | |
| <blockquote> | |
| <p><strong>SIMD Fundamentals — SSE, AVX, and | |
| Beyond</strong></p> | |
| </blockquote> | |
| <p>Here we’ll build on the memory foundation to explore:</p> | |
| <ul> | |
| <li><p>How aligned vs. unaligned loads/stores affect SIMD | |
| throughput</p></li> | |
| <li><p>Layout of vector registers and their preservation across | |
| calls</p></li> | |
| <li><p>Instruction encoding with VEX/EVEX and the role of | |
| masking/broadcasting</p></li> | |
| <li><p>Compiler vectorisation strategies and how they map to real | |
| encodings</p></li> | |
| <li><p>The marriage of cache‑friendly access patterns with vector | |
| loops</p></li> | |
| </ul> | |
| <p>This transition is natural: the same addressing forms, alignment | |
| rules, and prefetching hints you’ve just seen in scalar code have direct | |
| consequences for vectorised kernels in both hand‑coded assembly and | |
| compiler‑generated loops.</p> | |
| <hr /> | |
| <p>The chapter that follows continues with stack operations and calling | |
| conventions, applying the stack‑frame, alignment, and red‑zone rules | |
| introduced above; the SIMD material previewed here builds on the same | |
| memory model.</p> | |
| <hr /> | |
| <h2 | |
| id="chapter-4-stack-operations-and-calling-conventions"><strong>Chapter | |
| 4: Stack Operations and Calling Conventions</strong></h2> | |
| <h3 id="stack-architecture-fundamentals"><strong>4.1 Stack Architecture | |
| Fundamentals</strong></h3> | |
| <h4 id="stack-layout-and-growth-direction"><strong>Stack Layout and | |
| Growth Direction</strong></h4> | |
| <p>The x86-64 stack grows downward from high to low addresses, with RSP | |
| pointing to the top (lowest address) of the stack:</p> | |
| <pre class="assembly"><code>; Stack memory layout (addresses decrease downward) | |
| ; Higher addresses (0x7FFFFFFFFFFF) | |
| ; ┌─────────────────────┐ | |
| ; │ Environment vars │ | |
| ; ├─────────────────────┤ | |
| ; │ Program arguments │ | |
| ; ├─────────────────────┤ | |
| ; │ Stack frames │ ← Stack grows down | |
| ; │ ↓ │ | |
| ; │ [unused space] │ | |
| ; │ ↑ │ | |
| ; │ Heap │ ← Heap grows up | |
| ; ├─────────────────────┤ | |
| ; │ .bss/.data │ | |
| ; ├─────────────────────┤ | |
| ; │ .text │ | |
| ; └─────────────────────┘ | |
| ; Lower addresses (0x400000) | |
| ; Basic stack operations | |
| push rax ; RSP -= 8; [RSP] = RAX | |
| pop rbx ; RBX = [RSP]; RSP += 8 | |
| ; Equivalent manual operations | |
| sub rsp, 8 | |
| mov [rsp], rax ; Same as push rax | |
| mov rbx, [rsp] | |
| add rsp, 8 ; Same as pop rbx</code></pre> | |
| <h4 id="stack-pointer-alignment-requirements"><strong>Stack Pointer | |
| Alignment Requirements</strong></h4> | |
| <pre class="assembly"><code>; System V AMD64 ABI: RSP must be 16-byte aligned before CALL | |
| ; Windows x64 ABI: Same requirement | |
| check_alignment: | |
| ; At function entry RSP is 8 bytes off 16-byte alignment: | |
| ; (RSP + 8) mod 16 = 0 after CALL pushed the return address | |
| push rbp ; RSP now 16-byte aligned | |
| mov rbp, rsp | |
| ; Maintain alignment for calls | |
| sub rsp, 24 ; Local space (not 16-byte aligned) | |
| and rsp, -16 ; Force 16-byte alignment | |
| call some_function ; RSP+8 will be 16-byte aligned</code></pre> | |
| <h4 id="stack-frame-structure"><strong>Stack Frame | |
| Structure</strong></h4> | |
| <pre class="assembly"><code>; Standard stack frame layout | |
| ; ┌─────────────────────┐ Higher addresses | |
| ; │ Caller's frame │ | |
| ; ├─────────────────────┤ | |
| ; │ Arguments 7+ │ [rbp + 16 + n*8] | |
| ; ├─────────────────────┤ | |
| ; │ Return address │ [rbp + 8] | |
| ; ├─────────────────────┤ | |
| ; │ Saved RBP │ [rbp] ← RBP points here | |
| ; ├─────────────────────┤ | |
| ; │ Local variables │ [rbp - n] | |
| ; ├─────────────────────┤ | |
| ; │ Saved registers │ | |
| ; ├─────────────────────┤ | |
| ; │ Stack arguments │ | |
| ; └─────────────────────┘ ← RSP points here | |
| ; Lower addresses | |
| function_with_frame: | |
| push rbp ; Save caller's frame pointer | |
| mov rbp, rsp ; Establish new frame | |
| sub rsp, 48 ; Allocate local space | |
| ; Save callee-saved registers if used | |
| push rbx | |
| push r12 | |
| push r13 | |
| ; Function body | |
| mov [rbp - 8], rdi ; Store first param as local | |
| mov [rbp - 16], rsi ; Store second param | |
| ; Restore and return | |
| pop r13 | |
| pop r12 | |
| pop rbx | |
| mov rsp, rbp ; Restore stack pointer | |
| pop rbp ; Restore frame pointer | |
| ret</code></pre> | |
| <h3 id="system-v-amd64-abi"><strong>4.2 System V AMD64 ABI</strong></h3> | |
| <h4 id="register-usage-convention"><strong>Register Usage | |
| Convention</strong></h4> | |
| <pre class="assembly"><code>; Parameter passing registers (in order) | |
| ; Integer/Pointer: RDI, RSI, RDX, RCX, R8, R9 | |
| ; Floating-point: XMM0-XMM7 | |
| ; Register preservation rules: | |
| ; Caller-saved (volatile): RAX, RCX, RDX, RSI, RDI, R8-R11, XMM0-XMM15 | |
| ; Callee-saved (non-volatile): RBX, RBP, R12-R15 | |
| ; Special registers: | |
| ; RAX: Return value (also RDX for 128-bit returns) | |
| ; RSP: Stack pointer (must maintain alignment) | |
| ; RBP: Optional frame pointer</code></pre> | |
| <h4 id="function-calling-examples"><strong>Function Calling | |
| Examples</strong></h4> | |
| <pre class="assembly"><code>; C prototype: long sum6(long a, long b, long c, long d, long e, long f) | |
| call_sum6: | |
| ; First 6 arguments in registers | |
| mov rdi, 1 ; a | |
| mov rsi, 2 ; b | |
| mov rdx, 3 ; c | |
| mov rcx, 4 ; d | |
| mov r8, 5 ; e | |
| mov r9, 6 ; f | |
| call sum6 | |
| ; Result in RAX | |
| ; C prototype: long sum8(long a, long b, long c, long d, | |
| ; long e, long f, long g, long h) | |
| call_sum8: | |
| ; First 6 in registers, rest on stack | |
| push 8 ; h (8th argument) | |
| push 7 ; g (7th argument) | |
| mov rdi, 1 ; a | |
| mov rsi, 2 ; b | |
| mov rdx, 3 ; c | |
| mov rcx, 4 ; d | |
| mov r8, 5 ; e | |
| mov r9, 6 ; f | |
| call sum8 | |
| add rsp, 16 ; Clean up stack arguments</code></pre> | |
| <h4 id="floating-point-and-mixed-arguments"><strong>Floating-Point and | |
| Mixed Arguments</strong></h4> | |
| <pre class="assembly"><code>; C: double compute(int a, double b, float c, long d, double e) | |
| call_compute: | |
| mov edi, 42 ; a (int in EDI) | |
| movsd xmm0, [double_b] ; b (double in XMM0) | |
| movss xmm1, [float_c] ; c (float in XMM1) | |
| mov rsi, 100 ; d (long in RSI) | |
| movsd xmm2, [double_e] ; e (double in XMM2) | |
| call compute | |
| ; Result in XMM0 | |
| ; Structure passing (≤16 bytes passed in registers) | |
| ; struct Point { double x, y; }; // 16 bytes | |
| ; void process_point(Point p); | |
| pass_struct: | |
| movsd xmm0, [point_x] ; First 8 bytes in XMM0 | |
| movsd xmm1, [point_y] ; Second 8 bytes in XMM1 | |
| call process_point</code></pre> | |
| <h4 id="red-zone-usage"><strong>Red Zone Usage</strong></h4> | |
| <pre class="assembly"><code>; 128-byte red zone below RSP (System V AMD64 only!) | |
| ; Leaf functions can use without adjusting RSP | |
| leaf_function: | |
| ; Can use [rsp-128] to [rsp-1] without adjusting RSP | |
| mov [rsp - 8], rdi ; Save parameter | |
| mov [rsp - 16], rsi | |
| ; Computation | |
| add rdi, rsi | |
| imul rdi, [rsp - 8] | |
| mov rax, rdi ; Return value | |
| ret | |
| ; Non-leaf functions CANNOT rely on red zone | |
| non_leaf_function: | |
| sub rsp, 128 ; Must allocate space | |
| mov [rsp + 8], rdi ; Save parameters | |
| mov [rsp + 16], rsi | |
| call other_function ; Call may overwrite red zone | |
| add rsp, 128 | |
| ret</code></pre> | |
| <h4 id="variable-argument-functions"><strong>Variable Argument | |
| Functions</strong></h4> | |
| <pre class="assembly"><code>; C: int printf(const char* format, ...); | |
| ; Requires special handling for variable arguments | |
| call_printf: | |
| ; For varargs, AL = number of vector registers used | |
| lea rdi, [format_string] ; First fixed argument | |
| mov rsi, 42 ; First variable argument | |
| movsd xmm0, [double_val] ; FP argument | |
| mov al, 1 ; 1 XMM register used | |
| call printf | |
| ; Implementing varargs function | |
| varargs_function: | |
| ; Save all potential argument registers | |
| push rdi | |
| push rsi | |
| push rdx | |
| push rcx | |
| push r8 | |
| push r9 | |
| ; Save XMM registers if AL > 0 | |
| test al, al | |
| jz .no_xmm | |
| ; Save XMM0-XMM7 | |
| sub rsp, 128 | |
| movaps [rsp], xmm0 | |
| movaps [rsp + 16], xmm1 | |
| ; ... save remaining XMM registers | |
| .no_xmm: | |
| ; Process arguments using va_list</code></pre> | |
| <h3 id="microsoft-x64-abi"><strong>4.3 Microsoft x64 ABI</strong></h3> | |
| <h4 id="register-convention-differences"><strong>Register Convention | |
| Differences</strong></h4> | |
| <pre class="assembly"><code>; Microsoft x64 calling convention | |
| ; Parameter registers: RCX, RDX, R8, R9 (first 4) | |
| ; Floating-point: XMM0-XMM3 (correspond to parameter position) | |
| ; | |
| ; Caller-saved: RAX, RCX, RDX, R8-R11, XMM0-XMM5 | |
| ; Callee-saved: RBX, RBP, RDI, RSI, R12-R15, XMM6-XMM15 | |
| ; Shadow space requirement (32 bytes always reserved) | |
| ms_x64_caller: | |
| sub rsp, 32 ; Shadow space (mandatory) | |
| mov rcx, 1 ; First argument | |
| mov rdx, 2 ; Second argument | |
| mov r8, 3 ; Third argument | |
| mov r9, 4 ; Fourth argument | |
| call function | |
| add rsp, 32 ; Clean shadow space</code></pre> | |
| <h4 id="function-prologue-and-epilogue-windows"><strong>Function | |
| Prologue and Epilogue (Windows)</strong></h4> | |
| <pre class="assembly"><code>; Windows x64 function with frame pointer | |
| windows_function: | |
| push rbp ; Save frame pointer | |
| push rdi ; Save non-volatile registers | |
| push rsi | |
| push rbx | |
| sub rsp, 32h ; Allocate locals + maintain alignment | |
| lea rbp, [rsp + 32h] ; Frame pointer setup | |
| ; Function body | |
| mov [rbp - 8], rcx ; Save first parameter | |
| ; Epilogue | |
| lea rsp, [rbp] | |
| pop rbx | |
| pop rsi | |
| pop rdi | |
| pop rbp | |
| ret | |
| ; Windows unwind information (for SEH) | |
| .PROC windows_function | |
| push rbp | |
| .pushreg rbp | |
| push rdi | |
| .pushreg rdi | |
| push rsi | |
| .pushreg rsi | |
| push rbx | |
| .pushreg rbx | |
| sub rsp, 32h | |
| .allocstack 32h | |
| lea rbp, [rsp + 32h] | |
| .setframe rbp, 32h | |
| .endprolog | |
| ; Function body | |
| .ENDPROC</code></pre> | |
| <h4 id="floating-point-parameter-passing-windows"><strong>Floating-Point | |
| Parameter Passing (Windows)</strong></h4> | |
| <pre class="assembly"><code>; Windows x64: FP args go in XMM registers matching position | |
| ; void process(int a, double b, int c, float d) | |
| windows_fp_call: | |
| sub rsp, 32 ; Shadow space | |
| mov ecx, 10 ; a (integer in RCX) | |
| movsd xmm1, [double_b] ; b (double in XMM1 - 2nd position) | |
| mov r8d, 20 ; c (integer in R8) | |
| movss xmm3, [float_d] ; d (float in XMM3 - 4th position) | |
| call process | |
| add rsp, 32</code></pre> | |
| <h3 id="stack-frame-management"><strong>4.4 Stack Frame | |
| Management</strong></h3> | |
| <h4 id="frame-pointer-vs-frame-pointer-omission"><strong>Frame Pointer | |
| vs Frame Pointer Omission</strong></h4> | |
| <pre class="assembly"><code>; With frame pointer (traditional, easier debugging) | |
| with_frame_pointer: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 32 ; Locals | |
| mov [rbp - 8], rdi ; Access locals via RBP | |
| mov [rbp - 16], rsi | |
| ; RBP provides stable reference point | |
| leave ; mov rsp, rbp; pop rbp | |
| ret | |
| ; Without frame pointer (optimization) | |
| without_frame_pointer: | |
| sub rsp, 32 ; Locals | |
| mov [rsp + 24], rdi ; Access locals via RSP | |
| mov [rsp + 16], rsi | |
| ; All offsets relative to RSP | |
| ; One more register available (RBP) | |
| add rsp, 32 | |
| ret | |
| ; Compiler chooses based on: | |
| ; -fomit-frame-pointer (GCC/Clang) | |
| ; /Oy (MSVC)</code></pre> | |
| <h4 id="dynamic-stack-allocation-alloca"><strong>Dynamic Stack | |
| Allocation (alloca)</strong></h4> | |
| <pre class="assembly"><code>; Implementing variable-size stack allocation | |
| ; C: void* alloca(size_t size) | |
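| my_alloca: | |
| ; RDI contains size (System V AMD64) | |
| ; The allocation must outlive this routine, so it cannot return with RET: | |
| ; RET would pop its return address from the freshly allocated block. | |
| pop rdx ; Take the return address off the stack first | |
| add rdi, 15 ; Round up to 16-byte boundary | |
| and rdi, -16 | |
| sub rsp, rdi ; Allocate space in the caller's frame | |
| mov rax, rsp ; Return pointer | |
| jmp rdx ; Return without touching the new allocation | |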
| ; Using dynamic allocation | |
| function_with_vla: | |
| push rbp | |
| mov rbp, rsp | |
| ; Allocate variable-length array | |
| mov rdi, [rbp + 16] ; Get size parameter | |
| shl rdi, 3 ; Multiply by 8 (sizeof(long)) | |
| add rdi, 15 | |
| and rdi, -16 ; Align to 16 bytes | |
| sub rsp, rdi ; Allocate | |
| mov rax, rsp ; RAX points to array | |
| ; Use array... | |
| leave | |
| ret</code></pre> | |
| <h4 id="stack-unwinding-support"><strong>Stack Unwinding | |
| Support</strong></h4> | |
| <pre class="assembly"><code>; DWARF CFI directives (Linux/Unix) | |
| function_with_cfi: | |
| .cfi_startproc | |
| push rbp | |
| .cfi_def_cfa_offset 16 | |
| .cfi_offset rbp, -16 | |
| mov rbp, rsp | |
| .cfi_def_cfa_register rbp | |
| sub rsp, 32 | |
| ; Function body | |
| leave | |
| .cfi_def_cfa rsp, 8 | |
| ret | |
| .cfi_endproc | |
| ; Exception handling frame setup | |
| exception_aware_function: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 32 | |
| ; Set up exception handler | |
| lea rax, [exception_handler] | |
| mov [rsp], rax ; Handler address | |
| ; Code that might throw | |
| call potentially_throwing_function | |
| ; Clean up | |
| add rsp, 32 | |
| pop rbp | |
| ret | |
| exception_handler: | |
| ; Handle exception | |
| ; RSP points to exception record</code></pre> | |
| <h3 id="leaf-vs-non-leaf-functions"><strong>4.5 Leaf vs Non-Leaf | |
| Functions</strong></h3> | |
| <h4 id="leaf-function-optimization"><strong>Leaf Function | |
| Optimization</strong></h4> | |
| <pre class="assembly"><code>; Leaf function (doesn't call other functions) | |
| ; Can use red zone, minimal prologue/epilogue | |
| leaf_strlen: | |
| ; RDI = string pointer (System V AMD64) | |
| xor rax, rax ; Counter | |
| .loop: | |
| cmp byte [rdi + rax], 0 | |
| je .done | |
| inc rax | |
| jmp .loop | |
| .done: | |
| ret ; No stack frame needed | |
| ; Non-leaf equivalent | |
| non_leaf_strlen: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 16 ; Space for locals | |
| mov [rsp], rdi ; Save string pointer | |
| call some_validation ; Calls another function | |
| mov rdi, [rsp] ; Restore pointer | |
| ; ... strlen logic ... | |
| leave | |
| ret</code></pre> | |
| <h4 id="tail-call-optimization"><strong>Tail Call | |
| Optimization</strong></h4> | |
| <pre class="assembly"><code>; Recursive function with tail call | |
| ; long factorial(long n, long acc) | |
| factorial_tail: | |
| test rdi, rdi ; if (n == 0) | |
| jz .base_case | |
| ; Tail call: factorial(n - 1, acc * n) | |
| imul rsi, rdi ; acc *= n | |
| dec rdi ; n-- | |
| jmp factorial_tail ; Jump, don't push return address | |
| ; No new frame built | |
| .base_case: | |
| mov rax, rsi ; Return accumulated result | |
| ret</code></pre> | |
| <p><strong>Explanation:</strong><br /> | |
| Unlike a typical recursive call (<code>call factorial_tail</code>), here | |
| the function jumps directly to the next invocation without creating a | |
| fresh stack frame. This is possible because there’s no work left to do | |
| after the recursive step — the “tail” position. Tail‑call optimization | |
| (TCO) eliminates stack growth in deep recursion, turning recursion into | |
| iteration at the assembly level.</p> | |
| <hr /> | |
| <p>✅ <strong>Key Takeaways from Chapter 4:</strong></p> | |
| <ol type="1"> | |
| <li><p><strong>Stack discipline matters:</strong> Both the downward | |
| growth and strict 16‑byte alignment rules are central to correct ABI | |
| conformance on x86‑64.</p></li> | |
| <li><p><strong>Know your ABI differences:</strong> System V AMD64 and | |
| Microsoft x64 rules differ in parameter registers, shadow space, and | |
| red‑zone usage.</p></li> | |
| <li><p><strong>Frame management impacts debugging:</strong> Using a | |
| frame pointer helps tooling and stack traces; omitting it saves a | |
| register and can improve performance.</p></li> | |
| <li><p><strong>Leaf functions can skip overhead:</strong> Red‑zone and | |
| no‑frame-pointer optimizations keep them minimal.</p></li> | |
| <li><p><strong>Tail calls avoid extra frames:</strong> When the last | |
| action is a call, using a jump preserves stack height.</p></li> | |
| </ol> | |
| <hr /> | |
| <p>The next chapter, <strong>Chapter 5: Exception Handling and Stack | |
| Unwinding</strong>, builds on the stack layout and frame conventions | |
| described here to show how hardware exceptions, OS-level signals, and | |
| language-level exceptions locate and unwind call frames.</p> | |
| <hr /> | |
| <h2 | |
| id="chapter-5-exception-handling-and-stack-unwinding"><strong>Chapter 5: | |
| Exception Handling and Stack Unwinding</strong></h2> | |
| <h3 id="exception-handling-fundamentals"><strong>5.1 Exception Handling | |
| Fundamentals</strong></h3> | |
| <h4 id="types-of-exceptions-in-x86-64"><strong>Types of Exceptions in | |
| x86-64</strong></h4> | |
| <p>The x86-64 architecture supports multiple exception types, each | |
| requiring different handling mechanisms:</p> | |
| <pre class="assembly"><code>; Hardware exceptions (interrupts/traps/faults) | |
| ; ┌────────────────┬────────┬──────────────────────────────┐ | |
| ; │ Exception │ Vector │ Type & Description │ | |
| ; ├────────────────┼────────┼──────────────────────────────┤ | |
| ; │ Divide Error │ 0 │ Fault - DIV/IDIV by zero │ | |
| ; │ Debug │ 1 │ Fault/Trap - Debug exception │ | |
| ; │ NMI │ 2 │ Interrupt - Non-maskable │ | |
| ; │ Breakpoint │ 3 │ Trap - INT3 instruction │ | |
| ; │ Overflow │ 4 │ Trap - INTO instruction │ | |
| ; │ BOUND Range │ 5 │ Fault - BOUND instruction │ | |
| ; │ Invalid Opcode │ 6 │ Fault - UD2 or invalid │ | |
| ; │ Device Not Av. │ 7 │ Fault - No FPU │ | |
| ; │ Double Fault │ 8 │ Abort - Exception during exc │ | |
| ; │ Invalid TSS │ 10 │ Fault - Task switch error │ | |
| ; │ Segment Not P. │ 11 │ Fault - Segment not present │ | |
| ; │ Stack-Segment │ 12 │ Fault - Stack exception │ | |
| ; │ General Prot. │ 13 │ Fault - Protection violation │ | |
| ; │ Page Fault │ 14 │ Fault - Page not present │ | |
| ; │ x87 FPU Error │ 16 │ Fault - FPU error │ | |
| ; │ Alignment │ 17 │ Fault - Unaligned access │ | |
| ; │ Machine Check │ 18 │ Abort - Hardware error │ | |
| ; │ SIMD FP │ 19 │ Fault - SSE/AVX exception │ | |
| ; └────────────────┴────────┴──────────────────────────────┘ | |
| ; Exception handler entry point | |
| exception_handler: | |
| ; CPU automatically pushes (in order): | |
| ; SS, RSP, RFLAGS, CS, RIP | |
| ; (Error code for some exceptions) | |
| push rax ; Save all registers | |
| push rbx | |
| push rcx | |
| push rdx | |
| push rsi | |
| push rdi | |
| push rbp | |
| push r8 | |
| push r9 | |
| push r10 | |
| push r11 | |
| push r12 | |
| push r13 | |
| push r14 | |
| push r15 | |
| mov rdi, rsp ; Pass exception frame to handler | |
| call handle_exception | |
| pop r15 ; Restore registers | |
| pop r14 | |
| ; ... restore all | |
| add rsp, 8 ; Skip error code if present | |
| iretq ; Return from interrupt</code></pre> | |
| <h4 id="exception-frame-layout"><strong>Exception Frame | |
| Layout</strong></h4> | |
| <pre class="assembly"><code>; Exception stack frame (after CPU push) | |
| ; ┌─────────────────────┐ Higher addresses | |
| ; │ Old SS │ [RSP + 32] | |
| ; ├─────────────────────┤ | |
| ; │ Old RSP │ [RSP + 24] | |
| ; ├─────────────────────┤ | |
| ; │ Old RFLAGS │ [RSP + 16] | |
| ; ├─────────────────────┤ | |
| ; │ Old CS │ [RSP + 8] | |
| ; ├─────────────────────┤ | |
| ; │ Old RIP │ [RSP] ← RSP after exception (no error code) | |
| ; ├─────────────────────┤ | |
| ; │ Error Code │ (some exceptions only; it is then at [RSP] and the offsets above grow by 8) | |
| ; └─────────────────────┘ | |
| ; Accessing exception information | |
| get_fault_address: | |
| push rbp | |
| mov rbp, rsp | |
| ; For page fault, CR2 contains fault address | |
| mov rax, cr2 | |
| ; Error code at [rbp + 16] for faults with error code | |
| mov rdx, [rbp + 16] | |
| test rdx, 1 ; Check present bit | |
| jz .not_present | |
| test rdx, 2 ; Check write bit | |
| jnz .write_fault | |
| ; ... handle different fault types</code></pre> | |
| <h3 id="stack-unwinding-mechanisms"><strong>5.2 Stack Unwinding | |
| Mechanisms</strong></h3> | |
| <h4 id="dwarf-cfi-call-frame-information"><strong>DWARF CFI (Call Frame | |
| Information)</strong></h4> | |
| <p>The DWARF format provides detailed unwinding information for | |
| debuggers and exception handlers:</p> | |
| <pre class="assembly"><code>; DWARF CFI directives for stack unwinding | |
| complex_function: | |
| .cfi_startproc | |
| .cfi_personality 0x3, __gxx_personality_v0 | |
| .cfi_lsda 0x3, .LLSDA0 | |
| push rbp | |
| .cfi_def_cfa_offset 16 | |
| .cfi_offset rbp, -16 | |
| mov rbp, rsp | |
| .cfi_def_cfa_register rbp | |
| push rbx | |
| .cfi_offset rbx, -24 | |
| push r12 | |
| .cfi_offset r12, -32 | |
| sub rsp, 48 ; Local variables | |
| ; Function body that might throw | |
| call potentially_throwing_function | |
| ; Cleanup | |
| add rsp, 48 | |
| pop r12 | |
| .cfi_restore r12 | |
| pop rbx | |
| .cfi_restore rbx | |
| pop rbp | |
| .cfi_def_cfa rsp, 8 | |
| ret | |
| .cfi_endproc | |
| ; Exception handling table (LSDA - Language Specific Data Area) | |
| .LLSDA0: | |
| .byte 0xff ; LPStart encoding (omitted) | |
| .byte 0x00 ; TType encoding (absolute) | |
| .uleb128 .LLSDATT0-.LLSDATTD0 | |
| .LLSDATTD0: | |
| .byte 0x01 ; Call site encoding (uleb128) | |
| .uleb128 .LLSDACSE0-.LLSDACSB0 | |
| .LLSDACSB0: | |
| ; Call site table | |
| .uleb128 .LEHB0-.LFB0 ; Start of try block | |
| .uleb128 .LEHE0-.LEHB0 ; Length of try block | |
| .uleb128 .L1-.LFB0 ; Landing pad | |
| .uleb128 0x01 ; Action record | |
| .LLSDACSE0: | |
| .LLSDATT0: | |
| ; Type table and action records</code></pre> | |
| <h4 id="manual-stack-walking"><strong>Manual Stack Walking</strong></h4> | |
| <pre class="assembly"><code>; Stack unwinding without debug info | |
| ; Walk the chain of frame pointers | |
| walk_stack: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 32 | |
| ; RDI = output buffer for addresses | |
| ; RSI = max frames to capture | |
| mov rcx, rsi ; Frame counter | |
| mov r8, rbp ; Current frame pointer | |
| xor rdx, rdx ; Frame index | |
| .walk_loop: | |
| test rcx, rcx | |
| jz .done | |
| ; Validate frame pointer | |
| mov rax, r8 | |
| and rax, 7 ; Check alignment | |
| jnz .invalid_frame | |
| ; Check if readable (simplified) | |
| cmp r8, 0x1000 ; Too low? | |
| jb .invalid_frame | |
| mov r9, 0x7fffffffffff | |
| cmp r8, r9 ; Too high? | |
| ja .invalid_frame | |
| ; Get return address | |
| mov rax, [r8 + 8] | |
| mov [rdi + rdx*8], rax | |
| ; Move to next frame | |
| mov r8, [r8] ; Follow chain | |
| inc rdx | |
| dec rcx | |
| ; Check for end of chain | |
| test r8, r8 | |
| jnz .walk_loop | |
| .done: | |
| mov rax, rdx ; Return frame count | |
| leave | |
| ret | |
| .invalid_frame: | |
| mov rax, -1 ; Error | |
| leave | |
| ret</code></pre> | |
| <h3 id="seh-structured-exception-handling-on-windows"><strong>5.3 SEH | |
| (Structured Exception Handling) on Windows</strong></h3> | |
| <h4 id="seh-frame-setup"><strong>SEH Frame Setup</strong></h4> | |
| <pre class="assembly"><code>; Windows x64 Structured Exception Handling | |
| ; Uses function tables and unwind info | |
| seh_protected_function: | |
| ; Function prolog with SEH markers | |
| push rbp | |
| .pushreg rbp | |
| mov rbp, rsp | |
| .setframe rbp, 0 | |
| sub rsp, 64 | |
| .allocstack 64 | |
| ; Save non-volatile registers | |
| mov [rbp - 8], rbx | |
| .savereg rbx, -8 | |
| mov [rbp - 16], rsi | |
| .savereg rsi, -16 | |
| mov [rbp - 24], rdi | |
| .savereg rdi, -24 | |
| .endprolog | |
| ; Set up exception handler | |
| lea rcx, [exception_filter] | |
| lea rdx, [exception_handler] | |
| call __C_specific_handler_install | |
| ; Protected code block | |
| .try_begin: | |
| call risky_operation | |
| test rax, rax | |
| jz .error_path | |
| ; Normal execution continues | |
| jmp .try_end | |
| .error_path: | |
| ; Trigger exception | |
| mov rcx, 0xC0000005 ; Access violation code | |
| call RaiseException | |
| .try_end: | |
| ; Cleanup and return | |
| mov rbx, [rbp - 8] | |
| mov rsi, [rbp - 16] | |
| mov rdi, [rbp - 24] | |
| leave | |
| ret | |
| exception_filter: | |
| ; RCX = EXCEPTION_POINTERS | |
| mov rax, [rcx] ; EXCEPTION_RECORD | |
| mov edx, [rax] ; ExceptionCode (32-bit DWORD at offset 0) | |
| cmp edx, 0xC0000005 ; Access violation? | |
| je .handle_it | |
| mov eax, 0 ; EXCEPTION_CONTINUE_SEARCH | |
| ret | |
| .handle_it: | |
| mov eax, 1 ; EXCEPTION_EXECUTE_HANDLER | |
| ret | |
| exception_handler: | |
| ; Handle the exception | |
| ; Can modify context to resume execution | |
| ret</code></pre> | |
| <h4 id="unwind-information-structure"><strong>Unwind Information | |
| Structure</strong></h4> | |
| <pre class="assembly"><code>; Windows x64 unwind information | |
| ; Located in .pdata and .xdata sections | |
| ; .pdata entry (RUNTIME_FUNCTION) | |
| .section .pdata | |
| .long function_start ; Begin address (RVA) | |
| .long function_end ; End address (RVA) | |
| .long unwind_info ; Unwind info address (RVA) | |
| ; .xdata entry (UNWIND_INFO) | |
| .section .xdata | |
| unwind_info: | |
| .byte 0x01 ; Version:Flags (1:0) | |
| .byte prolog_size ; Size of prolog | |
| .byte unwind_code_count ; Count of unwind codes | |
| .byte frame_register:4 ; Frame register | |
| .byte frame_offset:4 ; Frame register offset (scaled) | |
| ; Unwind codes array | |
| .byte prolog_offset_1 ; Offset in prolog | |
| .byte unwind_op_1:4 ; Operation | |
| .byte op_info_1:4 ; Operation info | |
| ; UWOP_PUSH_NONVOL = 0 | |
| ; UWOP_ALLOC_LARGE = 1 | |
| ; UWOP_ALLOC_SMALL = 2 | |
| ; UWOP_SET_FPREG = 3 | |
| ; UWOP_SAVE_NONVOL = 4 | |
| ; UWOP_SAVE_XMM128 = 8</code></pre> | |
| <h3 id="c-exception-handling-implementation"><strong>5.4 C++ Exception | |
| Handling Implementation</strong></h3> | |
| <h4 id="itanium-abi-exception-model-gccclang"><strong>Itanium ABI | |
| Exception Model (GCC/Clang)</strong></h4> | |
| <pre class="assembly"><code>; C++ try/catch implementation details | |
| cpp_exception_example: | |
| .cfi_startproc | |
| .cfi_personality 0x3, __gxx_personality_v0 | |
| .cfi_lsda 0x3, .LLSDA1 | |
| push rbp | |
| .cfi_def_cfa_offset 16 | |
| .cfi_offset rbp, -16 | |
| mov rbp, rsp | |
| .cfi_def_cfa_register rbp | |
| sub rsp, 32 ; Space for exception object | |
| .LEHB0: ; Begin exception region | |
| ; Allocate exception object | |
| mov edi, 16 ; Size of exception | |
| call __cxa_allocate_exception | |
| mov rbx, rax ; Save exception pointer | |
| ; Construct exception object | |
| mov rdi, rbx | |
| lea rsi, [exception_message] | |
| call std::runtime_error::runtime_error | |
| ; Throw exception | |
| mov rdi, rbx ; Exception object | |
| lea rsi, [_ZTISt13runtime_error] ; Type info | |
| xor edx, edx ; No destructor | |
| call __cxa_throw ; Never returns | |
| .LEHE0: ; End exception region | |
| ; Normal return path (unreachable after throw) | |
| xor eax, eax | |
| leave | |
| .cfi_def_cfa rsp, 8 | |
| ret | |
| .L1: ; Landing pad (catch handler) | |
| .cfi_def_cfa rbp, 16 | |
| mov rdi, rax ; Exception object | |
| call __cxa_begin_catch | |
| ; Handle exception | |
| mov rdi, rax | |
| call process_exception | |
| call __cxa_end_catch | |
| ; Continue execution | |
| xor eax, eax | |
| leave | |
| ret | |
| .cfi_endproc | |
| ; Personality routine (called during unwinding) | |
| ; Determines if frame can handle exception | |
| __gxx_personality_v0: | |
| ; Complex logic to: | |
| ; 1. Parse LSDA (Language Specific Data Area) | |
| ; 2. Check type matching | |
| ; 3. Find appropriate catch handler | |
| ; 4. Execute cleanup code</code></pre> | |
| <h4 id="raii-and-destructor-calls-during-unwinding"><strong>RAII and | |
| Destructor Calls During Unwinding</strong></h4> | |
| <pre class="assembly"><code>; Automatic destructor calls during stack unwinding | |
| function_with_raii: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 64 | |
| ; Construct local object with destructor | |
| lea rdi, [rbp - 32] ; Object address | |
| call MyClass::MyClass | |
| ; Register destructor for unwinding | |
| lea rdi, [rbp - 32] | |
| lea rsi, [MyClass::~MyClass] | |
| call __cxa_push_cleanup | |
| .try_block: | |
| ; Code that might throw | |
| lea rdi, [rbp - 32] | |
| call MyClass::riskyOperation | |
| ; Normal cleanup | |
| lea rdi, [rbp - 32] | |
| call MyClass::~MyClass | |
| leave | |
| ret | |
| .cleanup_landing_pad: | |
| ; Called during exception unwinding | |
| push rax ; Save exception | |
| ; Call destructor | |
| lea rdi, [rbp - 32] | |
| call MyClass::~MyClass | |
| pop rdi ; Restore exception | |
| call _Unwind_Resume ; Continue unwinding</code></pre> | |
| <h3 id="signal-handling-and-asynchronous-exceptions"><strong>5.5 Signal | |
| Handling and Asynchronous Exceptions</strong></h3> | |
| <h4 id="posix-signal-frame"><strong>POSIX Signal Frame</strong></h4> | |
| <pre class="assembly"><code>; Signal handler with siginfo | |
| signal_handler: | |
| ; RDI = signal number | |
| ; RSI = siginfo_t* | |
| ; RDX = ucontext_t* | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 32 | |
| ; Save parameters | |
| mov [rbp - 8], rdi ; Signal number | |
| mov [rbp - 16], rsi ; siginfo_t | |
| mov [rbp - 24], rdx ; ucontext_t | |
| ; Access</code></pre> | |
| <h2><strong>Chapter 5: Exception Handling and Stack Unwinding</strong></h2> | |
| <p>This chapter covers how the x86‑64 architecture and its ABI conventions implement <strong>hardware exceptions</strong>, <strong>OS-level signal delivery</strong>, and <strong>language/runtime-level exception unwinding</strong>. We draw from <strong>Intel’s architecture manuals</strong>, <strong>System V AMD64 ABI</strong>, <strong>Windows x64 ABI</strong>, and <strong>C++ runtime conventions (Itanium ABI for GCC/Clang)</strong>.</p> | |
| <hr /> | |
| <h3><strong>5.1 Exception Basics in x86‑64</strong></h3> | |
| <h4>Hardware Exceptions</h4> | |
| <p>On Intel/AMD‑64 CPUs, exceptions are synchronous events triggered by execution faults (divide‑by‑zero, page faults, invalid opcodes, GP faults, alignment checks, etc.). They are handled through the <strong>IDT (Interrupt Descriptor Table)</strong> — each exception vector points to an ISR (interrupt service routine) or trap handler.</p> | |
| <p><strong>Fault vs Trap vs Abort:</strong></p> | |
| <ul> | |
| <li><strong>Fault:</strong> Restartable at faulting instruction (e.g., page fault).</li> | |
| <li><strong>Trap:</strong> Return after completing the current instruction (e.g., breakpoint <code>INT3</code>).</li> | |
| <li><strong>Abort:</strong> Non-recoverable (e.g., machine check, double fault).</li> | |
| </ul> | |
| <p>Example: Divide‑by‑zero handler skeleton:</p> | |
| <pre class="assembly"><code>section .text | |
| global div_by_zero_handler | |
| div_by_zero_handler: | |
| ; CPU pushes RIP, CS, RFLAGS, possibly error code | |
| push rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15 | |
| mov rdi, rsp ; pointer to saved registers | |
| call handle_div_error ; our C or asm routine | |
| pop r15 r14 r13 r12 r11 r10 r9 r8 rbp rdi rsi rdx rcx rbx rax | |
| add rsp, 8 ; skip error code if present | |
| iretq</code></pre> | |
| <p><strong>Exception frame</strong> layout follows the architecture’s | |
| push order; for faults with an error code, the code is pushed last, | |
| after RIP, so it sits at the top of the stack (lowest address) when the | |
| handler is entered.</p> | |
| <p>On <strong>page faults</strong>, CR2 holds the faulting linear | |
| address. The error code bits tell us:</p> | |
| <ul> | |
| <li><p>Bit 0: Present?</p></li> | |
| <li><p>Bit 1: Write access?</p></li> | |
| <li><p>Bit 2: User mode?</p></li> | |
| <li><p>Bit 3: Reserved bit violation?</p></li> | |
| <li><p>Bit 4: Instruction fetch?</p></li> | |
| </ul> | |
| <hr /> | |
| <h3 id="stack-unwinding-fundamentals"><strong>5.2 Stack Unwinding | |
| Fundamentals</strong></h3> | |
| <p>Occasionally, we need to <strong>walk up the stack</strong> to find | |
| calling functions — either to produce a backtrace (debugging) or to run | |
| cleanups during exceptions.</p> | |
| <h4 id="frame-pointer-chaining">Frame Pointer Chaining</h4> | |
| <p>Many compilers emit a standard frame pointer chain | |
| (<code>RBP</code>), allowing manual stack walking:</p> | |
| <div class="sourceCode" id="cb95"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb95-1"><a href="#cb95-1" aria-hidden="true" tabindex="-1"></a><span class="fu">walk_stack:</span></span> | |
| <span id="cb95-2"><a href="#cb95-2" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="kw">rbp</span> <span class="co">; current frame pointer</span></span> | |
| <span id="cb95-3"><a href="#cb95-3" aria-hidden="true" tabindex="-1"></a><span class="fu">.loop:</span></span> | |
| <span id="cb95-4"><a href="#cb95-4" aria-hidden="true" tabindex="-1"></a> <span class="bu">test</span> <span class="kw">rax</span><span class="op">,</span> <span class="kw">rax</span></span> | |
| <span id="cb95-5"><a href="#cb95-5" aria-hidden="true" tabindex="-1"></a> <span class="cf">jz</span> <span class="op">.</span>done</span> | |
| <span id="cb95-6"><a href="#cb95-6" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="kw">rcx</span><span class="op">,</span> <span class="op">[</span><span class="kw">rax</span><span class="op">+</span><span class="dv">8</span><span class="op">]</span> <span class="co">; return address</span></span> | |
| <span id="cb95-7"><a href="#cb95-7" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="kw">rdx</span><span class="op">,</span> <span class="op">[</span><span class="kw">rax</span><span class="op">]</span> <span class="co">; previous frame pointer</span></span> | |
| <span id="cb95-8"><a href="#cb95-8" aria-hidden="true" tabindex="-1"></a> <span class="co">; save rcx somewhere...</span></span> | |
| <span id="cb95-9"><a href="#cb95-9" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="kw">rdx</span></span> | |
| <span id="cb95-10"><a href="#cb95-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">jmp</span> <span class="op">.</span>loop</span> | |
| <span id="cb95-11"><a href="#cb95-11" aria-hidden="true" tabindex="-1"></a><span class="fu">.done:</span></span> | |
| <span id="cb95-12"><a href="#cb95-12" aria-hidden="true" tabindex="-1"></a> <span class="cf">ret</span></span></code></pre></div> | |
| <p>If <strong>frame pointer omission</strong> is enabled, DWARF Call | |
| Frame Information (CFI) or Windows unwind info is needed.</p> | |
| <hr /> | |
| <h3 id="dwarf-cfi-system-v-amd64"><strong>5.3 DWARF CFI (System V | |
| AMD64)</strong></h3> | |
| <p>The <strong>System V AMD64 ABI</strong> defines how unwinders locate | |
| a function’s call frame:</p> | |
| <ul> | |
| <li><p><strong>.cfi_startproc / .cfi_endproc</strong> delimit function’s | |
| unwind metadata.</p></li> | |
| <li><p><code>.cfi_def_cfa_register</code> selects the CFA (Canonical | |
| Frame Address) register.</p></li> | |
| <li><p><code>.cfi_offset</code> declares where each saved register lives | |
| relative to CFA.</p></li> | |
| </ul> | |
| <p>Example with possible throw:</p> | |
| <div class="sourceCode" id="cb96"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb96-1"><a href="#cb96-1" aria-hidden="true" tabindex="-1"></a>.cfi_startproc</span> | |
| <span id="cb96-2"><a href="#cb96-2" aria-hidden="true" tabindex="-1"></a>.cfi_personality <span class="bn">0x3</span><span class="op">,</span> __gxx_personality_v0</span> | |
| <span id="cb96-3"><a href="#cb96-3" aria-hidden="true" tabindex="-1"></a><span class="bu">push</span> <span class="kw">rbp</span></span> | |
| <span id="cb96-4"><a href="#cb96-4" aria-hidden="true" tabindex="-1"></a>.cfi_def_cfa_offset <span class="dv">16</span></span> | |
| <span id="cb96-5"><a href="#cb96-5" aria-hidden="true" tabindex="-1"></a>.cfi_offset <span class="kw">rbp</span><span class="op">,</span> <span class="op">-</span><span class="dv">16</span></span> | |
| <span id="cb96-6"><a href="#cb96-6" aria-hidden="true" tabindex="-1"></a><span class="bu">mov</span> <span class="kw">rbp</span><span class="op">,</span> <span class="kw">rsp</span></span> | |
| <span id="cb96-7"><a href="#cb96-7" aria-hidden="true" tabindex="-1"></a>.cfi_def_cfa_register <span class="kw">rbp</span></span> | |
| <span id="cb96-8"><a href="#cb96-8" aria-hidden="true" tabindex="-1"></a><span class="bu">push</span> <span class="kw">rbx</span></span> | |
| <span id="cb96-9"><a href="#cb96-9" aria-hidden="true" tabindex="-1"></a>.cfi_offset <span class="kw">rbx</span><span class="op">,</span> <span class="op">-</span><span class="dv">24</span></span> | |
| <span id="cb96-10"><a href="#cb96-10" aria-hidden="true" tabindex="-1"></a><span class="co">; body...</span></span> | |
| <span id="cb96-11"><a href="#cb96-11" aria-hidden="true" tabindex="-1"></a><span class="bu">pop</span> <span class="kw">rbx</span></span> | |
| <span id="cb96-12"><a href="#cb96-12" aria-hidden="true" tabindex="-1"></a>.cfi_restore <span class="kw">rbx</span></span> | |
| <span id="cb96-13"><a href="#cb96-13" aria-hidden="true" tabindex="-1"></a><span class="bu">pop</span> <span class="kw">rbp</span></span> | |
| <span id="cb96-14"><a href="#cb96-14" aria-hidden="true" tabindex="-1"></a>.cfi_def_cfa <span class="kw">rsp</span><span class="op">,</span> <span class="dv">8</span></span> | |
| <span id="cb96-15"><a href="#cb96-15" aria-hidden="true" tabindex="-1"></a><span class="cf">ret</span></span> | |
| <span id="cb96-16"><a href="#cb96-16" aria-hidden="true" tabindex="-1"></a>.cfi_endproc</span></code></pre></div> | |
| <p>This data is consumed by <code>_Unwind_RaiseException</code> inside | |
| the unwinder library (libgcc_s or LLVM’s libunwind) during C++ stack unwinding; <code>__cxa_throw</code> in libstdc++/libc++abi calls into it.</p> | |
| <hr /> | |
| <h3 id="windows-x64-seh-and-unwind-info"><strong>5.4 Windows x64 SEH and | |
| Unwind Info</strong></h3> | |
| <p>Windows uses <strong>Structured Exception Handling (SEH)</strong> and | |
| <strong>unwind metadata</strong> in <code>.pdata</code> and | |
| <code>.xdata</code> sections.</p> | |
| <ul> | |
| <li><p><strong>.pdata</strong>: runtime function table entries | |
| (start/end RVA, pointer to unwind info).</p></li> | |
| <li><p><strong>UNWIND_INFO</strong>: describes prolog, saved registers, | |
| and optional exception handler pointer.</p></li> | |
| </ul> | |
| <p>Example SEH-protected function:</p> | |
| <div class="sourceCode" id="cb97"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb97-1"><a href="#cb97-1" aria-hidden="true" tabindex="-1"></a>seh_fn PROC</span> | |
| <span id="cb97-2"><a href="#cb97-2" aria-hidden="true" tabindex="-1"></a> <span class="bu">push</span> <span class="kw">rbp</span></span> | |
| <span id="cb97-3"><a href="#cb97-3" aria-hidden="true" tabindex="-1"></a> .pushreg <span class="kw">rbp</span></span> | |
| <span id="cb97-4"><a href="#cb97-4" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="kw">rbp</span><span class="op">,</span> <span class="kw">rsp</span></span> | |
| <span id="cb97-5"><a href="#cb97-5" aria-hidden="true" tabindex="-1"></a> .setframe <span class="kw">rbp</span><span class="op">,</span><span class="dv">0</span></span> | |
| <span id="cb97-6"><a href="#cb97-6" aria-hidden="true" tabindex="-1"></a> <span class="bu">sub</span> <span class="kw">rsp</span><span class="op">,</span> <span class="bn">40h</span></span> | |
| <span id="cb97-7"><a href="#cb97-7" aria-hidden="true" tabindex="-1"></a> .allocstack <span class="bn">40h</span></span> | |
| <span id="cb97-8"><a href="#cb97-8" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="op">[</span><span class="kw">rbp</span><span class="op">-</span><span class="dv">8</span><span class="op">],</span> <span class="kw">rbx</span></span> | |
| <span id="cb97-9"><a href="#cb97-9" aria-hidden="true" tabindex="-1"></a> .savereg <span class="kw">rbx</span><span class="op">,</span> <span class="op">-</span><span class="dv">8</span></span> | |
| <span id="cb97-10"><a href="#cb97-10" aria-hidden="true" tabindex="-1"></a> .endprolog</span> | |
| <span id="cb97-11"><a href="#cb97-11" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb97-12"><a href="#cb97-12" aria-hidden="true" tabindex="-1"></a> <span class="co">; risk code</span></span> | |
| <span id="cb97-13"><a href="#cb97-13" aria-hidden="true" tabindex="-1"></a> <span class="cf">call</span> may_fault</span> | |
| <span id="cb97-14"><a href="#cb97-14" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb97-15"><a href="#cb97-15" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="kw">rbx</span><span class="op">,</span> <span class="op">[</span><span class="kw">rbp</span><span class="op">-</span><span class="dv">8</span><span class="op">]</span></span> | |
| <span id="cb97-16"><a href="#cb97-16" aria-hidden="true" tabindex="-1"></a> <span class="bu">leave</span></span> | |
| <span id="cb97-17"><a href="#cb97-17" aria-hidden="true" tabindex="-1"></a> <span class="cf">ret</span></span> | |
| <span id="cb97-18"><a href="#cb97-18" aria-hidden="true" tabindex="-1"></a>seh_fn ENDP</span></code></pre></div> | |
| <p>The Windows unwind codes (UWOP_PUSH_NONVOL, UWOP_ALLOC_SMALL, etc.) | |
| tell <code>RtlVirtualUnwind</code> (used by <code>RtlUnwindEx</code> and the exception dispatcher) how to restore registers and the stack.</p> | |
| <hr /> | |
| <h3 id="language-level-exception-flow-itanium-c-abi"><strong>5.5 | |
| Language-Level Exception Flow (Itanium C++ ABI)</strong></h3> | |
| <p>GCC and Clang on AMD64 Linux/macOS implement the <strong>Itanium | |
| ABI</strong> personality function model:</p> | |
| <ul> | |
| <li><p>Each try block is a “call site” in LSDA (Language Specific Data | |
| Area).</p></li> | |
| <li><p>On throw, the unwinder calls the <strong>personality | |
| function</strong> with each frame’s LSDA to evaluate catches and | |
| destructors.</p></li> | |
| <li><p>RAII destructors are called automatically during | |
| unwinding.</p></li> | |
| </ul> | |
| <p>Example throw/catch at asm level:</p> | |
| <div class="sourceCode" id="cb98"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb98-1"><a href="#cb98-1" aria-hidden="true" tabindex="-1"></a><span class="fu">try_block_start:</span></span> | |
| <span id="cb98-2"><a href="#cb98-2" aria-hidden="true" tabindex="-1"></a> <span class="co">; might throw</span></span> | |
| <span id="cb98-3"><a href="#cb98-3" aria-hidden="true" tabindex="-1"></a> <span class="cf">call</span> risky_op</span> | |
| <span id="cb98-4"><a href="#cb98-4" aria-hidden="true" tabindex="-1"></a><span class="fu">try_block_end:</span></span> | |
| <span id="cb98-5"><a href="#cb98-5" aria-hidden="true" tabindex="-1"></a> <span class="co">; normal path</span></span> | |
| <span id="cb98-6"><a href="#cb98-6" aria-hidden="true" tabindex="-1"></a> <span class="cf">ret</span></span> | |
| <span id="cb98-7"><a href="#cb98-7" aria-hidden="true" tabindex="-1"></a><span class="fu">catch_lpad:</span></span> | |
| <span id="cb98-8"><a href="#cb98-8" aria-hidden="true" tabindex="-1"></a> <span class="cf">call</span> __cxa_begin_catch</span> | |
| <span id="cb98-9"><a href="#cb98-9" aria-hidden="true" tabindex="-1"></a> <span class="co">; handle</span></span> | |
| <span id="cb98-10"><a href="#cb98-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">call</span> __cxa_end_catch</span> | |
| <span id="cb98-11"><a href="#cb98-11" aria-hidden="true" tabindex="-1"></a> <span class="cf">ret</span></span></code></pre></div> | |
| <p>Throwing (<code>__cxa_throw</code>) never returns; stack unwinding | |
| proceeds via <code>_Unwind_RaiseException</code>.</p> | |
| <hr /> | |
| <h3 id="signals-posix-asynchronous-exceptions"><strong>5.6 Signals | |
| (POSIX Asynchronous Exceptions)</strong></h3> | |
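| <p>Unix signals raised by hardware faults (SIGSEGV, SIGFPE, SIGILL, etc.) are | |
| delivered synchronously to the faulting thread, whereas signals such as SIGINT or SIGALRM arrive | |
| asynchronously. In both cases the kernel sets up a <strong>signal frame</strong> | |
| containing register context (<code>ucontext_t</code>), so a handler can | |
| inspect/modify state.</p> | |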
| <div class="sourceCode" id="cb99"><pre class="sourceCode c"><code class="sourceCode c"><span id="cb99-1"><a href="#cb99-1" aria-hidden="true" tabindex="-1"></a><span class="dt">void</span> handler<span class="op">(</span><span class="dt">int</span> sig<span class="op">,</span> siginfo_t <span class="op">*</span>si<span class="op">,</span> <span class="dt">void</span> <span class="op">*</span>ctx<span class="op">)</span> <span class="op">{</span></span> | |
| <span id="cb99-2"><a href="#cb99-2" aria-hidden="true" tabindex="-1"></a> ucontext_t <span class="op">*</span>uc <span class="op">=</span> ctx<span class="op">;</span></span> | |
| <span id="cb99-3"><a href="#cb99-3" aria-hidden="true" tabindex="-1"></a> printf<span class="op">(</span><span class="st">"FaultAddr=</span><span class="sc">%p</span><span class="st"> RIP=</span><span class="sc">%llx\n</span><span class="st">"</span><span class="op">,</span> si<span class="op">-></span>si_addr<span class="op">,</span></span> | |
| <span id="cb99-4"><a href="#cb99-4" aria-hidden="true" tabindex="-1"></a> <span class="op">(</span><span class="dt">unsigned</span> <span class="dt">long</span> <span class="dt">long</span><span class="op">)</span>uc<span class="op">-></span>uc_mcontext<span class="op">.</span>gregs<span class="op">[</span>REG_RIP<span class="op">]);</span></span> | |
| <span id="cb99-5"><a href="#cb99-5" aria-hidden="true" tabindex="-1"></a><span class="op">}</span></span></code></pre></div> | |
| <p>At asm level, the OS switches to the handler much like an interrupt, | |
| saving all state.</p> | |
| <hr /> | |
| <h3 id="practical-stack-unwinding-example"><strong>5.7 Practical Stack | |
| Unwinding Example</strong></h3> | |
| <p>Here’s how a backtrace is implemented with DWARF:</p> | |
| <ol type="1"> | |
| <li><p>Locate unwind info for current RIP.</p></li> | |
| <li><p>Apply unwind ops to recover caller’s CFA and register | |
| set.</p></li> | |
| <li><p>Repeat for each frame until the top.</p></li> | |
| </ol> | |
| <p>On Windows: use <code>RtlCaptureContext</code>, | |
| <code>RtlLookupFunctionEntry</code>, and <code>RtlVirtualUnwind</code> to step through each frame.</p> | |
| <p>On Linux: <code>libunwind</code> or | |
| <code>_Unwind_Backtrace</code>.</p> | |
| <hr /> | |
| <h3 id="key-points"><strong>Key Points:</strong></h3> | |
| <ul> | |
| <li><p><strong>Hardware exceptions</strong> are defined in the CPU docs; | |
| handling them requires saving/restoring full CPU state.</p></li> | |
| <li><p><strong>Stack unwinding</strong> depends on ABI-provided metadata | |
| (DWARF or Windows unwind info) if frame pointers aren’t | |
| available.</p></li> | |
| <li><p><strong>Language runtimes</strong> (C++, Rust) integrate with | |
| these mechanisms via personality functions and metadata tables; the standard Go toolchain uses its own runtime unwinder and tables instead.</p></li> | |
| <li><p><strong>Asynchronous signals</strong> use an OS-delivered stack | |
| frame; care must be taken for async-signal safety.</p></li> | |
| <li><p>RAII destructors and cleanup handlers run <em>during</em> | |
| unwinding automatically.</p></li> | |
| <li><p>In cross-platform code, <strong>System V AMD64 ABI</strong> and | |
| <strong>Microsoft x64 ABI</strong> have distinct unwind metadata | |
| formats.</p></li> | |
| </ul> | |
| <hr /> | |
| <p>For the authoritative definition of each mechanism, consult the | |
| Intel SDM Volume 3 (IDT, exception vectors, and CR2 usage) and the | |
| corresponding system-programming chapters of the AMD64 Architecture | |
| Programmer’s Manual.</p> | |
| <hr /> | |
| <h2 id="chapter-6-x87-fpu-and-legacy-floating-point"><strong>Chapter 6: | |
| x87 FPU and Legacy Floating Point</strong></h2> | |
| <h3 id="x87-fpu-architecture-overview"><strong>6.1 x87 FPU Architecture | |
| Overview</strong></h3> | |
| <h4 id="x87-register-stack-model"><strong>x87 Register Stack | |
| Model</strong></h4> | |
| <p>The x87 FPU uses a unique stack-based architecture with eight 80-bit | |
| floating-point registers (ST0-ST7), unlike the flat register model of | |
| modern SSE/AVX:</p> | |
| <pre class="assembly"><code>; x87 FPU Register Stack | |
| ; ┌─────────────────────────────────────┐ | |
| ; │ ST(0) - Top of Stack (TOS) │ 80 bits | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ST(1) │ 80 bits | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ST(2) │ 80 bits | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ST(3) │ 80 bits | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ST(4) │ 80 bits | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ST(5) │ 80 bits | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ST(6) │ 80 bits | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ST(7) │ 80 bits | |
| ; └─────────────────────────────────────┘ | |
| ; 80-bit Extended Precision Format | |
| ; ┌───┬────────────────┬──────────────────────────────────┐ | |
| ; │ S │ Exponent (15) │ Significand (64 bits) │ | |
| ; └───┴────────────────┴──────────────────────────────────┘ | |
| ; Bit: 79 78-64 63-0 | |
| ; Basic stack operations | |
| fld_example: | |
| fld dword [float_value] ; Push 32-bit float onto stack | |
| fld qword [double_value] ; Push 64-bit double | |
| fld tbyte [extended_value] ; Push 80-bit extended | |
| ; Stack now: ST(0) = extended, ST(1) = double, ST(2) = float | |
| faddp ; ST(1) = ST(1) + ST(0), pop | |
| fstp qword [result] ; Store and pop</code></pre> | |
| <h4 id="x87-control-and-status-words"><strong>x87 Control and Status | |
| Words</strong></h4> | |
| <pre class="assembly"><code>; x87 Control Word (FCW) - 16 bits | |
| ; ┌──┬──┬──┬──┬──┬──┬────┬────┬──┬──┬──┬──┬──┬──┬──┬──┐ | |
| ; │X │ RC │ PC │XX│ PM │UM │OM │ZM │DM │IM │ | |
| ; └──┴──┴──┴──┴──┴──┴────┴────┴──┴──┴──┴──┴──┴──┴──┴──┘ | |
| ; Bits: 15-13 12-11 10-9 8-7 6 5 4 3 2 1 0 | |
| ; | |
| ; RC = Rounding Control (00=nearest, 01=down, 10=up, 11=truncate) | |
| ; PC = Precision Control (00=single, 10=double, 11=extended) | |
| ; Exception Masks: PM=Precision, UM=Underflow, OM=Overflow, | |
| ; ZM=Zero divide, DM=Denormal, IM=Invalid | |
| ; x87 Status Word (FSW) - 16 bits | |
| ; ┌──┬────────┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┬──┐ | |
| ; │B │ TOP │C3│ ST │C2│C1│C0│ES│SF│PE│UE│OE│ZE│DE│IE│ | |
| ; └──┴────────┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┴──┘ | |
| ; Bits: 15 14-11 10-8 7 6 5 4 3 2 1 0 | |
| ; | |
| ; B = Busy, TOP = Stack top pointer | |
| ; C0-C3 = Condition codes | |
| ; ES = Error summary, SF = Stack fault | |
| ; Exception flags: PE=Precision, UE=Underflow, OE=Overflow, | |
| ; ZE=Zero divide, DE=Denormal, IE=Invalid | |
| control_word_setup: | |
| sub rsp, 16 | |
| ; Get current control word | |
| fstcw [rsp] | |
| mov ax, [rsp] | |
| ; Set rounding mode to truncate (11b) | |
| or ax, 0x0C00 ; Set bits 11-10 to 11 | |
| mov [rsp + 2], ax | |
| fldcw [rsp + 2] ; Load new control word | |
| ; Do computation with truncation | |
| fld qword [value] | |
| frndint ; Round to integer using current mode | |
| ; Restore original control word | |
| fldcw [rsp] | |
| add rsp, 16 | |
| ret</code></pre> | |
| <h3 id="x87-instruction-categories"><strong>6.2 x87 Instruction | |
| Categories</strong></h3> | |
| <h4 id="data-transfer-instructions"><strong>Data Transfer | |
| Instructions</strong></h4> | |
| <pre class="assembly"><code>; Loading values onto the stack | |
| load_operations: | |
| fld dword [mem32] ; Load 32-bit float | |
| fld qword [mem64] ; Load 64-bit double | |
| fld tbyte [mem80] ; Load 80-bit extended | |
| fld st(2) ; Duplicate ST(2) to ST(0) | |
| fld1 ; Push 1.0 | |
| fldz ; Push 0.0 | |
| fldpi ; Push π | |
| fldl2e ; Push log₂(e) | |
| fldl2t ; Push log₂(10) | |
| fldlg2 ; Push log₁₀(2) | |
| fldln2 ; Push ln(2) | |
| ; Storing values from the stack | |
| store_operations: | |
| fst dword [mem32] ; Store ST(0) as float (no pop) | |
| fstp qword [mem64] ; Store ST(0) as double and pop | |
| fstp tbyte [mem80] ; Store ST(0) as extended and pop | |
| fist word [mem16] ; Store ST(0) as 16-bit int | |
| fistp dword [mem32] ; Store as 32-bit int and pop | |
| fistp qword [mem64] ; Store as 64-bit int and pop | |
| ; Integer loading | |
| integer_load: | |
| fild word [int16] ; Load 16-bit integer | |
| fild dword [int32] ; Load 32-bit integer | |
| fild qword [int64] ; Load 64-bit integer | |
| ; Exchange operations | |
| fxch ; Exchange ST(0) with ST(1) | |
| fxch st(3) ; Exchange ST(0) with ST(3)</code></pre> | |
| <h4 id="arithmetic-operations"><strong>Arithmetic | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; Basic arithmetic | |
| arithmetic_ops: | |
| ; Addition | |
| fadd st(0), st(1) ; ST(0) = ST(0) + ST(1) | |
| fadd dword [mem32] ; ST(0) = ST(0) + mem32 | |
| faddp st(2), st(0) ; ST(2) = ST(2) + ST(0), pop | |
| fiadd word [int16] ; ST(0) = ST(0) + (float)int16 | |
| ; Subtraction | |
| fsub st(0), st(1) ; ST(0) = ST(0) - ST(1) | |
| fsubr st(0), st(1) ; ST(0) = ST(1) - ST(0) (reverse) | |
| fsubp ; ST(1) = ST(1) - ST(0), pop | |
| fisubr dword [int32] ; ST(0) = int32 - ST(0) | |
| ; Multiplication | |
| fmul st(0), st(2) ; ST(0) = ST(0) * ST(2) | |
| fmulp st(1), st(0) ; ST(1) = ST(1) * ST(0), pop | |
| fimul word [int16] ; ST(0) = ST(0) * int16 | |
| ; Division | |
| fdiv st(0), st(1) ; ST(0) = ST(0) / ST(1) | |
| fdivr st(0), st(1) ; ST(0) = ST(1) / ST(0) (reverse) | |
| fdivp ; ST(1) = ST(1) / ST(0), pop | |
| fidiv dword [int32] ; ST(0) = ST(0) / int32 | |
| ; Other operations | |
| fsqrt ; ST(0) = sqrt(ST(0)) | |
| fabs ; ST(0) = |ST(0)| | |
| fchs ; ST(0) = -ST(0) | |
| frndint ; ST(0) = round(ST(0))</code></pre> | |
| <h4 id="transcendental-functions"><strong>Transcendental | |
| Functions</strong></h4> | |
| <pre class="assembly"><code>; Trigonometric functions | |
| trig_operations: | |
| ; Calculate sin(x) | |
| fld qword [angle] ; Load angle in radians | |
| fsin ; ST(0) = sin(ST(0)) | |
| fstp qword [result] | |
| ; Calculate cos(x) | |
| fld qword [angle] | |
| fcos ; ST(0) = cos(ST(0)) | |
| ; Calculate both sin and cos | |
| fld qword [angle] | |
| fsincos ; ST(0) = cos, ST(1) = sin | |
| ; Calculate tan(x) | |
| fld qword [angle] | |
| fptan ; ST(0) = 1.0, ST(1) = tan | |
| fstp st(0) ; Pop the 1.0 | |
| ; Calculate arctan(y/x) | |
| fld qword [y] | |
| fld qword [x] | |
| fpatan ; ST(0) = arctan(ST(1)/ST(0)) | |
| ; Logarithmic and exponential | |
| log_exp_operations: | |
| ; Calculate log₂(x) | |
| fld1 | |
| fld qword [x] | |
| fyl2x ; ST(0) = ST(1) * log₂(ST(0)) | |
| ; Calculate log₁₀(x) = log₂(x) * log₁₀(2) | |
| fldlg2 ; Load log₁₀(2) | |
| fld qword [x] | |
| fyl2x | |
| ; Calculate ln(x) = log₂(x) * ln(2) | |
| fldln2 ; Load ln(2) | |
| fld qword [x] | |
| fyl2x | |
| ; Calculate 2^x | |
| fld qword [x] | |
| f2xm1 ; ST(0) = 2^ST(0) - 1 (for |x| < 1) | |
| fld1 | |
| faddp ; Add 1 to get 2^x | |
| ; Calculate x^y using: x^y = 2^(y*log₂(x)) | |
| fld qword [y] | |
| fld qword [x] | |
| fyl2x ; ST(0) = y * log₂(x) | |
| fld st(0) | |
| frndint ; Get integer part | |
| fsub st(1), st(0) ; Fractional part in ST(1) | |
| fxch | |
| f2xm1 ; 2^frac - 1 | |
| fld1 | |
| faddp ; 2^frac | |
| fscale ; Scale by 2^int</code></pre> | |
| <h3 id="comparison-and-conditional-operations"><strong>6.3 Comparison | |
| and Conditional Operations</strong></h3> | |
| <h4 id="comparison-instructions"><strong>Comparison | |
| Instructions</strong></h4> | |
| <pre class="assembly"><code>; Comparison operations | |
| comparison_ops: | |
| ; Compare and set flags | |
| fcom st(1) ; Compare ST(0) with ST(1) | |
| fcomp dword [mem32] ; Compare with memory and pop | |
| fcompp ; Compare ST(0), ST(1) and pop both | |
| ficom word [int16] ; Compare with integer | |
| ; Unordered compare (handles NaN) | |
| fucom st(1) ; Unordered compare | |
| fucomp st(2) ; Compare and pop | |
| fucompp ; Compare and pop both | |
| ; Test and classify | |
| ftst ; Compare ST(0) with 0.0 | |
| fxam ; Examine ST(0) and set condition codes | |
| ; Transfer flags to CPU | |
| transfer_flags: | |
| ; Method 1: Via AX register | |
| fstsw ax ; Store status word in AX | |
| sahf ; Store AH into FLAGS | |
| ja .greater ; Now can use CPU conditional jumps | |
| jb .less | |
| je .equal | |
| ; Method 2: Via memory | |
| fstsw [status_word] | |
| mov ax, [status_word] | |
| test ax, 0x4500 ; Check C3, C2, C0 bits | |
| ; Floating compare and set EFLAGS directly (P6+) | |
| fcomi st(0), st(1) ; Compare and set ZF, PF, CF | |
| jae .greater_equal ; Can use CPU jumps directly | |
| fucomi st(0), st(1) ; Unordered compare version | |
| jp .unordered ; Jump if unordered (NaN)</code></pre> | |
| <h4 id="conditional-move-fcmovcc"><strong>Conditional Move | |
| (FCMOVcc)</strong></h4> | |
| <pre class="assembly"><code>; Conditional moves based on EFLAGS (P6+) | |
| conditional_moves: | |
| ; Setup comparison in integer unit | |
| cmp eax, ebx | |
| ; Conditional FP moves based on integer flags | |
| fcmovb st(0), st(1) ; Move if below (CF=1) | |
| fcmove st(0), st(2) ; Move if equal (ZF=1) | |
| fcmovbe st(0), st(3) ; Move if below or equal | |
| fcmovnb st(0), st(4) ; Move if not below | |
| fcmovne st(0), st(5) ; Move if not equal | |
| fcmovnbe st(0), st(6) ; Move if not below or equal | |
| fcmovu st(0), st(7) ; Move if unordered (PF=1) | |
| fcmovnu st(0), st(1) ; Move if not unordered | |
| ; Example: Conditional selection | |
| fmax_implementation: | |
| fld qword [value1] | |
| fld qword [value2] ; ST(0) = value2, ST(1) = value1 | |
| fcomi st(0), st(1) ; Compare ST(0) with ST(1) | |
| fcmovb st(0), st(1) ; If ST(0) < ST(1), move ST(1) to ST(0) | |
| fstp qword [maximum] ; Store maximum | |
| fstp st(0) ; Clean stack | |
| ret</code></pre> | |
| <h3 id="exception-handling"><strong>6.4 Exception Handling</strong></h3> | |
| <h4 id="x87-exception-types"><strong>x87 Exception Types</strong></h4> | |
| <pre class="assembly"><code>; x87 exceptions and their handling | |
| exception_handling: | |
| ; Save current control word | |
| sub rsp, 16 | |
| fstcw [rsp] | |
| ; Unmask all exceptions for testing | |
| mov ax, [rsp] | |
| and ax, 0xFFC0 ; Clear all exception mask bits | |
| mov [rsp + 2], ax | |
| fldcw [rsp + 2] | |
| ; Operations that might raise exceptions: | |
| ; 1. Invalid Operation (IE) | |
| fldz | |
| fldz | |
| fdiv st(0), st(1) ; 0/0 = NaN | |
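; 2. Divide by Zero (ZE) - see the worked example under "Masking vs. Unmasking" below
; Restore the original control word and release the scratch space
fldcw [rsp]
add rsp, 16</code></pre> | |
| <p>This section covers the exception <em>types</em>, <em>masking</em>, and <em>recovery</em> for the x87 FPU, drawing on the Intel SDM Vol. 1 Ch. 8 and the legacy floating‑point sections.</p> | |
| <hr /> | |
| <h4 id="x87-exception-types-1"><strong>x87 Exception Types</strong></h4> | |
| <p>The status word’s low bits (IE, DE, ZE, OE, UE, PE) represent:</p> | |
| <table> | |
| <caption>x87 status word exception flags</caption> | |
| <thead> | |
| <tr><th scope="col">Bit</th><th scope="col">Flag</th><th scope="col">Description</th></tr> | |
| </thead> | |
| <tbody> | |
| <tr><td>0</td><td><strong>IE</strong></td><td>Invalid Operation — e.g., √(−1), log(−1), 0/0, an ordered comparison (<code>fcom</code>) with a NaN, x87 stack underflow/overflow (which also sets SF)</td></tr> | |
| <tr><td>1</td><td><strong>DE</strong></td><td>Denormal Operand — operand is denormalized (subnormal)</td></tr> | |
| <tr><td>2</td><td><strong>ZE</strong></td><td>Divide‑by‑Zero — finite non-zero dividend with a divisor of 0.0 (also log(0) via <code>fyl2x</code>)</td></tr> | |
| <tr><td>3</td><td><strong>OE</strong></td><td>Overflow — result too large for the destination format</td></tr> | |
| <tr><td>4</td><td><strong>UE</strong></td><td>Underflow — result too small; gradual underflow possible</td></tr> | |
| <tr><td>5</td><td><strong>PE</strong></td><td>Precision — inexact result (rounding occurred)</td></tr> | |
| </tbody> | |
| </table> | |
| <hr /> | |
| <h4 id="masking-vs.-unmasking"><strong>Masking vs. Unmasking</strong></h4> | |
| <p>The <strong>Control Word</strong> contains an <em>exception mask bit</em> for each of the above.</p> | |
| <ul> | |
| <li><strong>Mask bit = 1</strong> → the exception is <em>masked</em>: execution continues and the FPU supplies a default result (NaN, ±∞, a denormal or zero, etc.).</li> | |
| <li><strong>Mask bit = 0</strong> → the exception is <em>unmasked</em>: the processor raises <code>#MF</code> (x87 Floating-Point Exception).</li> | |
| </ul> | |
| <p>Example: unmask only Divide-by-Zero:</p> | |
| <pre class="assembly"><code>sub rsp, 16 | |
fstcw [rsp] ; Save CW
mov ax, [rsp]
and ax, 0xFFFB ; Clear bit 2 (ZM mask) → unmask
mov [rsp+2], ax
fldcw [rsp+2]
fldz ; ST(0) = 0.0
fld1 ; ST(0) = 1.0, ST(1) = 0.0
fdiv st(0), st(1) ; 1.0 / 0.0 sets ZE; #MF is delivered at the next waiting x87 instruction
fldcw [rsp] ; Restore CW
add rsp, 16</code></pre> | |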
| <hr /> | |
| <h4 id="exception-service"><strong>Exception Service</strong></h4> | |
| <p>When unmasked and an FP exception occurs:</p> | |
| <ol type="1"> | |
| <li><p>The operation completes or traps immediately.</p></li> | |
| <li><p>FPU sets corresponding flag in <strong>Status Word</strong> and | |
| sets <strong>ES</strong> (Error Summary) bit.</p></li> | |
| <li><p>Processor signals exception handler:</p></li> | |
| </ol> | |
| <ul> | |
| <li>On <em>hardware</em> level: raises <code>#MF</code> on the next FP | |
| instruction (deferred reporting — see Intel SDM “Deferred Floating Point | |
| Exceptions”).</li> | |
| <li>On <em>OS</em> level: Linux delivers <code>SIGFPE</code> with a | |
| <code>siginfo_t</code> pointing to faulting instruction.</li> | |
| </ul> | |
| <p>Deferred means: if you want immediate detection, insert | |
| <code>fwait</code> after expected fault instruction.</p> | |
| <hr /> | |
| <h4 id="flag-testing-in-software"><strong>Flag Testing in | |
| Software</strong></h4> | |
| <p>You can interrogate the <strong>Status Word</strong> directly:</p> | |
| <div class="sourceCode" id="cb108"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb108-1"><a href="#cb108-1" aria-hidden="true" tabindex="-1"></a><span class="bu">fstsw</span> <span class="kw">ax</span> <span class="co">; Store SW in AX</span></span> | |
| <span id="cb108-2"><a href="#cb108-2" aria-hidden="true" tabindex="-1"></a><span class="bu">test</span> <span class="kw">ax</span><span class="op">,</span> <span class="bn">0x0004</span> <span class="co">; Check ZE (bit 2)</span></span> | |
| <span id="cb108-3"><a href="#cb108-3" aria-hidden="true" tabindex="-1"></a><span class="cf">jnz</span> <span class="op">.</span>div_by_zero</span> | |
| <span id="cb108-4"><a href="#cb108-4" aria-hidden="true" tabindex="-1"></a><span class="bu">test</span> <span class="kw">ax</span><span class="op">,</span> <span class="bn">0x0020</span> <span class="co">; Check PE</span></span> | |
| <span id="cb108-5"><a href="#cb108-5" aria-hidden="true" tabindex="-1"></a><span class="cf">jnz</span> <span class="op">.</span>inexact</span></code></pre></div> | |
| <p>Or mask all but one flag to detect it.</p> | |
| <hr /> | |
| <h4 id="cooperating-with-os-exception-models"><strong>Cooperating with | |
| OS Exception Models</strong></h4> | |
| <h5 id="unix-like-systems-linuxmacosbsd"><strong>Unix-like systems | |
| (Linux/macOS/BSD)</strong></h5> | |
| <ul> | |
| <li><p>The kernel sets FPU state per-thread and saves/restores on | |
| context switches (<code>FXSAVE</code>/<code>FXRSTOR</code>).</p></li> | |
| <li><p><code>SIGFPE</code> handlers can query FP state with | |
| <code>fenv.h</code> functions:</p></li> | |
| </ul> | |
| <div class="sourceCode" id="cb109"><pre | |
| class="sourceCode c"><code class="sourceCode c"><span id="cb109-1"><a href="#cb109-1" aria-hidden="true" tabindex="-1"></a><span class="pp">#include </span><span class="im"><fenv.h></span></span> | |
| <span id="cb109-2"><a href="#cb109-2" aria-hidden="true" tabindex="-1"></a>feclearexcept<span class="op">(</span>FE_ALL_EXCEPT<span class="op">);</span></span> | |
| <span id="cb109-3"><a href="#cb109-3" aria-hidden="true" tabindex="-1"></a><span class="co">// ...</span></span> | |
| <span id="cb109-4"><a href="#cb109-4" aria-hidden="true" tabindex="-1"></a><span class="cf">if</span> <span class="op">(</span>fetestexcept<span class="op">(</span>FE_DIVBYZERO<span class="op">))</span> <span class="op">{</span> <span class="co">/* handle */</span> <span class="op">}</span></span></code></pre></div> | |
| <h5 id="windows-structured-exception-handling"><strong>Windows | |
| (Structured Exception Handling)</strong></h5> | |
| <ul> | |
| <li><p>Unmasked FPU exceptions show as | |
| <code>EXCEPTION_FLT_DIVIDE_BY_ZERO</code>, | |
| <code>EXCEPTION_FLT_OVERFLOW</code>, etc.</p></li> | |
| <li><p>SEH handler gets <code>EXCEPTION_RECORD</code> with | |
| floating-point status.</p></li> | |
| </ul> | |
| <hr /> | |
| <h3 id="modern-context-why-care-in-x8664"><strong>6.5 Modern Context: | |
| Why Care in x86‑64</strong></h3> | |
| <p>Even though SSE/AVX have largely replaced x87 for new code:</p> | |
| <ul> | |
| <li><p>Legacy libraries (especially math and scientific code) may still | |
| use x87 for extended precision.</p></li> | |
| <li><p>Certain ABI contexts preserve x87 stack for | |
| compatibility.</p></li> | |
| <li><p>OS signal/exception machinery is designed to work with both x87 | |
| and SIMD FP exceptions.</p></li> | |
| </ul> | |
| <hr /> | |
| <p>When transitioning from x87 to SSE/AVX, compare the x87 and SSE | |
| implementations of the same task side by side and watch for mixed-mode | |
| pitfalls (e.g., register state contamination). <strong>Chapter 7</strong> | |
| introduces the SSE/SSE2 programming model that replaces x87 in most new | |
| code.</p> | |
| <hr /> | |
| <h2 id="chapter-7-sse-and-sse2-programming"><strong>Chapter 7: SSE and | |
| SSE2 Programming</strong></h2> | |
| <h3 id="ssesse2-architecture-overview"><strong>7.1 SSE/SSE2 Architecture | |
| Overview</strong></h3> | |
| <h4 id="introduction-to-streaming-simd-extensions"><strong>Introduction | |
| to Streaming SIMD Extensions</strong></h4> | |
| <p>SSE (Streaming SIMD Extensions) and SSE2 represent the modern | |
| approach to floating-point and integer SIMD operations in x86-64, | |
| replacing the legacy x87 FPU for most applications:</p> | |
| <pre class="assembly"><code>; SSE/SSE2 Register Architecture | |
| ; ┌─────────────────────────────────────┐ | |
| ; │ XMM0 │ 128 bits (16 bytes) │ | |
| ; ├─────────────────────────────────────┤ | |
| ; │ XMM1 │ 128 bits │ | |
| ; ├─────────────────────────────────────┤ | |
| ; │ XMM2 │ 128 bits │ | |
| ; ├─────────────────────────────────────┤ | |
| ; │ ... │ │ | |
| ; ├─────────────────────────────────────┤ | |
| ; │ XMM15 │ 128 bits (x86-64 only) │ | |
| ; └─────────────────────────────────────┘ | |
| ; Data Types and Packing | |
| ; ┌────────────────────────────────────────────────┐ | |
| ; │ 4 × 32-bit single-precision floats (SSE) │ | |
| ; ├────────────────────────────────────────────────┤ | |
| ; │ 2 × 64-bit double-precision floats (SSE2) │ | |
| ; ├────────────────────────────────────────────────┤ | |
| ; │ 16 × 8-bit integers (SSE2) │ | |
| ; ├────────────────────────────────────────────────┤ | |
| ; │ 8 × 16-bit integers (SSE2) │ | |
| ; ├────────────────────────────────────────────────┤ | |
| ; │ 4 × 32-bit integers (SSE2) │ | |
| ; ├────────────────────────────────────────────────┤ | |
| ; │ 2 × 64-bit integers (SSE2) │ | |
| ; └────────────────────────────────────────────────┘ | |
| ; Basic SSE/SSE2 operation example | |
| sse_intro: | |
| movaps xmm0, [aligned_floats] ; Load 4 floats (must be 16-byte aligned) | |
| movups xmm1, [unaligned_floats] ; Load 4 floats (no alignment required) | |
| addps xmm0, xmm1 ; Add 4 float pairs in parallel | |
| mulps xmm0, xmm2 ; Multiply 4 floats in parallel | |
| movaps [result], xmm0 ; Store result | |
| ret</code></pre> | |
| <h4 id="mxcsr-controlstatus-register"><strong>MXCSR Control/Status | |
| Register</strong></h4> | |
| <pre class="assembly"><code>; MXCSR - 32-bit control and status register | |
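| ; ┌──────────┬────┬───────┬────┬────┬────┬────┬────┬────┬─────┬────┬────┬────┬────┬────┬────┐ | |
| ; │ Reserved │ FZ │  RC   │ PM │ UM │ OM │ ZM │ DM │ IM │ DAZ │ PE │ UE │ OE │ ZE │ DE │ IE │ | |
| ; ├──────────┼────┼───────┼────┼────┼────┼────┼────┼────┼─────┼────┼────┼────┼────┼────┼────┤ | |
| ; │  31-16   │ 15 │ 14-13 │ 12 │ 11 │ 10 │  9 │  8 │  7 │  6  │  5 │  4 │  3 │  2 │  1 │  0 │ | |
| ; └──────────┴────┴───────┴────┴────┴────┴────┴────┴────┴─────┴────┴────┴────┴────┴────┴────┘ | |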
| ; | |
| ; FZ = Flush to Zero | |
| ; RC = Rounding Control (00=nearest, 01=down, 10=up, 11=truncate) | |
| ; DAZ = Denormals Are Zero | |
| ; Exception Masks: PM=Precision, UM=Underflow, OM=Overflow, | |
| ; ZM=Zero divide, DM=Denormal, IM=Invalid | |
| ; Exception Flags: PE, UE, OE, ZE, DE, IE (same meanings) | |
| mxcsr_setup: | |
| sub rsp, 8 | |
| ; Get current MXCSR | |
| stmxcsr [rsp] | |
| mov eax, [rsp] | |
| ; Enable flush-to-zero mode for performance | |
| or eax, 0x8000 ; Set FZ bit | |
| or eax, 0x0040 ; Set DAZ bit | |
| mov [rsp], eax | |
| ldmxcsr [rsp] | |
| ; Perform computations | |
| ; ... | |
| add rsp, 8 | |
| ret</code></pre> | |
| <h3 id="sse-floating-point-operations"><strong>7.2 SSE Floating-Point | |
| Operations</strong></h3> | |
| <h4 id="single-precision-scalar-operations"><strong>Single-Precision | |
| Scalar Operations</strong></h4> | |
| <pre class="assembly"><code>; Scalar operations (operate on lowest element only) | |
| scalar_float_ops: | |
| ; Load operations | |
| movss xmm0, [float_val] ; Load single float to low 32 bits (upper 96 bits zeroed) | |
| movss xmm1, xmm2 ; Copy low 32 bits; upper 96 bits of xmm1 are left unchanged | |
| ; Arithmetic (suffix 'ss' = scalar single) | |
| addss xmm0, xmm1 ; xmm0[31:0] += xmm1[31:0] | |
| subss xmm0, [memory] ; Subtract from memory | |
| mulss xmm0, xmm2 ; Multiply scalar | |
| divss xmm0, xmm3 ; Divide scalar | |
| sqrtss xmm0, xmm1 ; Square root of scalar | |
| ; Min/Max operations | |
| maxss xmm0, xmm1 ; Maximum of two scalars | |
| minss xmm0, [value] ; Minimum with memory | |
| ; Comparisons (set all bits to 1 or 0) | |
| cmpss xmm0, xmm1, 0 ; Equal (EQ) | |
| cmpss xmm0, xmm1, 1 ; Less than (LT) | |
| cmpss xmm0, xmm1, 2 ; Less than or equal (LE) | |
| cmpss xmm0, xmm1, 3 ; Unordered (NaN check) | |
| cmpss xmm0, xmm1, 4 ; Not equal (NEQ) | |
| cmpss xmm0, xmm1, 5 ; Not less than (NLT) | |
| cmpss xmm0, xmm1, 6 ; Not less than or equal (NLE) | |
| cmpss xmm0, xmm1, 7 ; Ordered (not NaN) | |
| ; Scalar compare and set EFLAGS | |
| ucomiss xmm0, xmm1 ; Compare and set ZF, PF, CF | |
| jbe .less_or_equal ; Can use CPU conditional jumps | |
| comiss xmm0, [value] ; Ordered compare (signals on NaN) | |
| jp .is_nan ; Jump if unordered</code></pre> | |
| <h4 id="single-precision-packed-operations"><strong>Single-Precision | |
| Packed Operations</strong></h4> | |
| <pre class="assembly"><code>; Packed operations (operate on all 4 floats) | |
| packed_float_ops: | |
| ; Aligned loads/stores (16-byte alignment required) | |
| movaps xmm0, [aligned_array] ; Load 4 floats | |
| movaps [result], xmm0 ; Store 4 floats | |
| ; Unaligned loads/stores (slower but no alignment requirement) | |
| movups xmm1, [unaligned_array] ; Load 4 floats unaligned | |
| movups [result], xmm1 ; Store unaligned | |
| ; Arithmetic (suffix 'ps' = packed single) | |
| addps xmm0, xmm1 ; Add 4 float pairs | |
| subps xmm0, [memory] ; Subtract 4 floats | |
| mulps xmm0, xmm2 ; Multiply 4 pairs | |
| divps xmm0, xmm3 ; Divide 4 pairs | |
| sqrtps xmm0, xmm1 ; Square root of 4 floats | |
| rcpps xmm0, xmm1 ; Reciprocal approximation (fast 1/x) | |
| rsqrtps xmm0, xmm1 ; Reciprocal sqrt approximation | |
| ; Min/Max operations | |
| maxps xmm0, xmm1 ; Element-wise maximum | |
| minps xmm0, [array] ; Element-wise minimum | |
| ; Horizontal operations (SSE3) | |
| haddps xmm0, xmm1 ; Horizontal add | |
| ; xmm0[31:0] = xmm0[31:0] + xmm0[63:32] | |
| ; xmm0[63:32] = xmm0[95:64] + xmm0[127:96] | |
| ; xmm0[95:64] = xmm1[31:0] + xmm1[63:32] | |
| ; xmm0[127:96]= xmm1[95:64] + xmm1[127:96] | |
| ; Example: Vector dot product | |
| dot_product_4: | |
| movaps xmm0, [vector_a] ; Load a0, a1, a2, a3 | |
| movaps xmm1, [vector_b] ; Load b0, b1, b2, b3 | |
| mulps xmm0, xmm1 ; a0*b0, a1*b1, a2*b2, a3*b3 | |
| ; Sum all elements (SSE3 version) | |
| haddps xmm0, xmm0 ; a0*b0+a1*b1, a2*b2+a3*b3, ... | |
| haddps xmm0, xmm0 ; Final sum in all positions | |
| movss [result], xmm0 ; Store scalar result | |
| ret</code></pre> | |
| <h4 id="shuffle-and-permute-operations"><strong>Shuffle and Permute | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; Shuffling elements within and between registers | |
| shuffle_operations: | |
| ; shufps - Shuffle packed singles | |
| ; Immediate byte selects which elements: [d3 d2 | d1 d0] | |
| ; d0, d1 select from xmm0 (destination) → result elements 0 and 1 | |
| ; d2, d3 select from xmm1 (source) → result elements 2 and 3 | |
| movaps xmm0, [array_a] ; a3, a2, a1, a0 | |
| movaps xmm1, [array_b] ; b3, b2, b1, b0 | |
| shufps xmm0, xmm1, 0xE4 ; 11 10 01 00 binary | |
| ; Result: xmm0 = [b3, b2, a1, a0] | |
| ; movhlps - Move high to low packed single | |
| movhlps xmm0, xmm1 ; xmm0[63:0] = xmm1[127:64] | |
| ; movlhps - Move low to high packed single | |
| movlhps xmm0, xmm1 ; xmm0[127:64] = xmm1[63:0] | |
| ; unpcklps - Unpack and interleave low singles | |
| unpcklps xmm0, xmm1 | |
| ; Result: xmm0 = [b1, a1, b0, a0] | |
| ; unpckhps - Unpack and interleave high singles | |
| unpckhps xmm0, xmm1 | |
| ; Result: xmm0 = [b3, a3, b2, a2]</code></pre> | |
| <h3 id="sse2-double-precision-operations"><strong>7.3 SSE2 | |
| Double-Precision Operations</strong></h3> | |
| <h4 id="double-precision-scalar-and-packed"><strong>Double-Precision | |
| Scalar and Packed</strong></h4> | |
| <pre class="assembly"><code>; Scalar double operations (suffix 'sd') | |
| scalar_double_ops: | |
| movsd xmm0, [double_val] ; Load scalar double | |
| addsd xmm0, xmm1 ; Add scalar doubles | |
| subsd xmm0, [memory] ; Subtract | |
| mulsd xmm0, xmm2 ; Multiply | |
| divsd xmm0, xmm3 ; Divide | |
| sqrtsd xmm0, xmm1 ; Square root | |
| maxsd xmm0, xmm1 ; Maximum | |
| minsd xmm0, xmm1 ; Minimum | |
| ucomisd xmm0, xmm1 ; Compare and set flags | |
| jae .greater_or_equal | |
| ; Packed double operations (suffix 'pd') | |
| packed_double_ops: | |
| movapd xmm0, [aligned_doubles] ; Load 2 doubles (aligned) | |
| movupd xmm1, [unaligned_doubles] ; Load 2 doubles (unaligned) | |
| addpd xmm0, xmm1 ; Add 2 double pairs | |
| subpd xmm0, [memory] ; Subtract 2 doubles | |
| mulpd xmm0, xmm2 ; Multiply 2 pairs | |
| divpd xmm0, xmm3 ; Divide 2 pairs | |
| sqrtpd xmm0, xmm1 ; Square root of 2 doubles | |
| maxpd xmm0, xmm1 ; Element-wise maximum | |
| minpd xmm0, [array] ; Element-wise minimum | |
| ; Horizontal add (SSE3) | |
| haddpd xmm0, xmm1 | |
| ; xmm0[63:0] = xmm0[63:0] + xmm0[127:64] | |
| ; xmm0[127:64]= xmm1[63:0] + xmm1[127:64]</code></pre> | |
| <h3 id="sse2-integer-operations"><strong>7.4 SSE2 Integer | |
| Operations</strong></h3> | |
| <h4 id="integer-data-movement"><strong>Integer Data | |
| Movement</strong></h4> | |
| <pre class="assembly"><code>; Integer move operations | |
| integer_moves: | |
| ; Move aligned/unaligned 128-bit integer data | |
| movdqa xmm0, [aligned_ints] ; Aligned 128-bit move | |
| movdqu xmm1, [unaligned_ints] ; Unaligned 128-bit move | |
| ; Move 32/64-bit integers to/from XMM | |
| movd xmm0, eax ; Move 32-bit int to XMM[31:0] | |
| movd ebx, xmm1 ; Extract XMM[31:0] to 32-bit reg | |
| movq xmm0, rax ; Move 64-bit int to XMM[63:0] | |
| movq rcx, xmm2 ; Extract XMM[63:0] to 64-bit reg | |
| ; Broadcast operations | |
| ; (Later SSE versions add more broadcast instructions) | |
| movd xmm0, eax | |
| pshufd xmm0, xmm0, 0x00 ; Broadcast to all 4 dwords</code></pre> | |
| <h4 id="integer-arithmetic-1"><strong>Integer Arithmetic</strong></h4> | |
| <pre class="assembly"><code>; Packed integer arithmetic | |
| integer_arithmetic: | |
| movdqa xmm0, [int_array_a] | |
| movdqa xmm1, [int_array_b] | |
| ; Addition (b=byte, w=word, d=dword, q=qword) | |
| paddb xmm0, xmm1 ; Add 16 bytes | |
| paddw xmm0, xmm1 ; Add 8 words | |
| paddd xmm0, xmm1 ; Add 4 dwords | |
| paddq xmm0, xmm1 ; Add 2 qwords | |
| ; Subtraction | |
| psubb xmm0, xmm1 ; Subtract 16 bytes | |
| psubw xmm0, xmm1 ; Subtract 8 words | |
| psubd xmm0, xmm1 ; Subtract 4 dwords | |
| psubq xmm0, xmm1 ; Subtract 2 qwords | |
| ; Integer arithmetic (continued) | |
| ; Multiplication | |
| pmullw xmm0, xmm1 ; Multiply 8 signed/unsigned words -> low 16 bits | |
| pmulhw xmm0, xmm1 ; Multiply 8 signed words -> high 16 bits | |
| pmulhuw xmm0, xmm1 ; Multiply 8 unsigned words -> high 16 bits | |
| pmuludq xmm0, xmm1 ; Multiply 2 unsigned doublewords -> quadword results | |
| ; Saturating arithmetic | |
| paddsb xmm0, xmm1 ; Add signed bytes with saturation | |
| paddusb xmm0, xmm1 ; Add unsigned bytes with saturation | |
| paddsw xmm0, xmm1 ; Add signed words with saturation | |
| paddusw xmm0, xmm1 ; Add unsigned words with saturation | |
| psubsb xmm0, xmm1 ; Subtract signed bytes with saturation | |
| psubusb xmm0, xmm1 ; Subtract unsigned bytes with saturation | |
| psubsw xmm0, xmm1 ; Subtract signed words with saturation | |
| psubusw xmm0, xmm1 ; Subtract unsigned words with saturation</code></pre> | |
| <hr /> | |
| <h4 id="logical-and-bitwise-operations"><strong>Logical and Bitwise | |
| Operations</strong></h4> | |
| <pre class="assembly"><code> ; Bitwise logical ops | |
| pand xmm0, xmm1 ; Bitwise AND | |
| por xmm0, xmm1 ; Bitwise OR | |
| pxor xmm0, xmm1 ; Bitwise XOR | |
| pandn xmm0, xmm1 ; bitwise AND NOT (¬xmm0 AND xmm1) | |
| ; Compare packed integers | |
| pcmpeqb xmm0, xmm1 ; Compare equal (bytes) | |
| pcmpeqw xmm0, xmm1 ; Compare equal (words) | |
| pcmpeqd xmm0, xmm1 ; Compare equal (dwords) | |
| pcmpgtb xmm0, xmm1 ; Compare greater than (signed bytes) | |
| pcmpgtw xmm0, xmm1 ; Compare greater than (signed words) | |
| pcmpgtd xmm0, xmm1 ; Compare greater than (signed dwords) | |
| ; Shifts | |
| psllw xmm0, xmm1 ; Shift left words by count in xmm1[63:0] | |
| pslld xmm0, xmm1 ; Shift left dwords | |
| psllq xmm0, xmm1 ; Shift left qwords | |
| psraw xmm0, xmm1 ; Shift right arithmetic (signed) words | |
| psrad xmm0, xmm1 ; Shift right arithmetic dwords | |
| psrlw xmm0, xmm1 ; Shift right logical words | |
| psrld xmm0, xmm1 ; Shift right logical dwords | |
| psrlq xmm0, xmm1 ; Shift right logical qwords | |
| ; Shift by immediate | |
| psllw xmm0, 4 ; Shift left words by imm8</code></pre> | |
| <hr /> | |
| <h4 id="packing-and-unpacking-integers"><strong>Packing and Unpacking | |
| Integers</strong></h4> | |
| <pre class="assembly"><code> ; Pack with signed saturation | |
| packsswb xmm0, xmm1 ; Pack words to bytes with saturation | |
| packssdw xmm0, xmm1 ; Pack dwords to words with saturation | |
| ; Pack with unsigned saturation | |
| packuswb xmm0, xmm1 ; Pack words to bytes, unsigned saturation | |
| packusdw xmm0, xmm1 ; Pack dwords to words, unsigned saturation (SSE4.1, not SSE2) | |
| ; Unpack and interleave | |
| punpcklbw xmm0, xmm1 ; Unpack and interleave low-order bytes | |
| punpckhbw xmm0, xmm1 ; Unpack and interleave high-order bytes | |
| punpcklwd xmm0, xmm1 ; Unpack and interleave low-order words | |
| punpckhwd xmm0, xmm1 ; Unpack and interleave high-order words | |
| punpckldq xmm0, xmm1 ; Unpack and interleave low-order dwords | |
| punpckhdq xmm0, xmm1 ; Unpack and interleave high-order dwords</code></pre> | |
| <hr /> | |
| <h4 | |
| id="conversion-between-integer-and-floating-point"><strong>Conversion | |
| Between Integer and Floating Point</strong></h4> | |
| <p>SSE2 adds rich conversion instructions to move between integer and FP | |
| domains:</p> | |
| <pre class="assembly"><code> ; Integer → float | |
| cvtdq2ps xmm0, xmm1 ; Convert 4 signed dwords to packed single float | |
| cvtpi2ps xmm0, mm1 ; Convert 2 signed dwords to packed single float | |
| cvtpd2ps xmm0, xmm1 ; Convert 2 doubles to 2 single floats (precision change, not an integer conversion) | |
| ; Float → integer | |
| cvtps2dq xmm0, xmm1 ; Convert 4 single floats to signed dwords | |
| cvtps2pi mm0, xmm1 ; Convert 2 single floats to signed dwords | |
| cvtps2pd xmm0, xmm1 ; Convert 2 single floats to 2 doubles (precision change, not an integer conversion) | |
| ; Double → integer | |
| cvtpd2dq xmm0, xmm1 ; Convert 2 double floats to signed dwords | |
| cvtpd2pi mm0, xmm1 ; Convert 2 double floats to signed dwords | |
| ; With truncation (round toward zero) | |
| cvttps2dq xmm0, xmm1 ; Convert with truncation | |
| cvttpd2dq xmm0, xmm1 ; Convert 2 doubles to signed dwords (truncate)</code></pre> | |
| <hr /> | |
| <p><strong>Performance Notes:</strong></p> | |
| <ul> | |
| <li><p>Use <code>movaps</code>/<code>movapd</code> for aligned data to | |
| avoid penalties; align dynamic allocations to 16 bytes.</p></li> | |
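| <li><p>Schedule independent SIMD instructions between dependent ones to | |
| hide latency, and avoid switching the same data between integer and | |
| floating-point instructions, which can add bypass delays.</p></li> | |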
| <li><p>Setting DAZ/FZ in MXCSR can significantly speed up streaming | |
| computations whose data contains or produces denormals, at the cost of | |
| strict IEEE 754 behavior.</p></li> | |
| </ul> | |
| <hr /> | |
| <p><strong>Compiler Perspective:</strong></p> | |
| <ul> | |
| <li><p>Modern GCC/Clang/ICC auto-vectorization often emits these SSE2 | |
| ops when <code>-O3</code> is enabled and target supports | |
| <code>-msse2</code>.</p></li> | |
| <li><p>Intrinsics (<code>_mm_*</code>) map directly to these | |
| instructions and avoid manual inline assembly.</p></li> | |
| <li><p>Knowledge of exact ops helps in verifying compiler output and | |
| hand‑tuning bottlenecks.</p></li> | |
| </ul> | |
| <hr /> | |
| <p>This wraps up <strong>Chapter 7</strong> with a complete set of | |
| SSE/SSE2 float and integer instruction families, their usage patterns, | |
| and tuning considerations.</p> | |
| <p>Next, in <strong>Chapter 8</strong> we will push deeper into | |
| <strong>Advanced SSE Extensions</strong> — SSE3, SSSE3, and SSE4 — | |
| adding horizontal reductions, absolute values, byte shuffles, blends, | |
| and text-processing/vector-masking features.</p> | |
| <hr /> | |
| <h2 | |
| id="chapter-8-advanced-sse-extensions-sse3-ssse3-sse4"><strong>Chapter | |
| 8: Advanced SSE Extensions (SSE3, SSSE3, SSE4)</strong></h2> | |
| <h3 id="sse3-extensions"><strong>8.1 SSE3 Extensions</strong></h3> | |
| <h4 id="horizontal-arithmetic-operations"><strong>Horizontal Arithmetic | |
| Operations</strong></h4> | |
| <p>SSE3 (introduced with Pentium 4 Prescott) adds horizontal operations | |
| that combine elements within the same register, solving common reduction | |
| patterns:</p> | |
| <pre class="assembly"><code>; Horizontal operations in SSE3 | |
| sse3_horizontal: | |
| ; HADDPS - Horizontal add packed single-precision | |
| movaps xmm0, [vec_a] ; a3, a2, a1, a0 | |
| movaps xmm1, [vec_b] ; b3, b2, b1, b0 | |
| haddps xmm0, xmm1 | |
| ; Result: xmm0 = [b3+b2, b1+b0, a3+a2, a1+a0] | |
| ; HADDPD - Horizontal add packed double-precision | |
| movapd xmm2, [doubles_a] ; a1, a0 | |
| movapd xmm3, [doubles_b] ; b1, b0 | |
| haddpd xmm2, xmm3 | |
| ; Result: xmm2 = [b1+b0, a1+a0] | |
| ; HSUBPS - Horizontal subtract packed single | |
| hsubps xmm0, xmm1 | |
| ; Result: xmm0 = [b2-b3, b0-b1, a2-a3, a0-a1] (a = destination, b = source) | |
| ; HSUBPD - Horizontal subtract packed double | |
| hsubpd xmm2, xmm3 | |
| ; Result: xmm2 = [b0-b1, a0-a1] | |
| ; Practical example: Sum all elements in a vector | |
| sum_vector_elements: | |
| movaps xmm0, [vector] ; Load 4 floats | |
| haddps xmm0, xmm0 ; [sum23, sum01, sum23, sum01] | |
| haddps xmm0, xmm0 ; [total, total, total, total] | |
| movss [sum], xmm0 ; Store scalar result | |
| ret | |
| ; Alternative mixed add/subtract | |
| addsubps_example: | |
| movaps xmm0, [array_a] | |
| movaps xmm1, [array_b] | |
| addsubps xmm0, xmm1 ; Alternating subtract/add | |
| ; xmm0[31:0] = a0 - b0 | |
| ; xmm0[63:32] = a1 + b1 | |
| ; xmm0[95:64] = a2 - b2 | |
| ; xmm0[127:96]= a3 + b3 | |
| ret</code></pre> | |
| <h4 id="special-move-operations"><strong>Special Move | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; SSE3 move operations | |
| sse3_moves: | |
| ; MOVSLDUP - Move/duplicate low singles | |
| movsldup xmm0, [memory] | |
| ; Result: xmm0 = [src[2], src[2], src[0], src[0]] | |
| ; MOVSHDUP - Move/duplicate high singles | |
| movshdup xmm0, [memory] | |
| ; Result: xmm0 = [src[3], src[3], src[1], src[1]] | |
| ; MOVDDUP - Move/duplicate double | |
| movddup xmm0, [double_val] | |
| ; Result: xmm0 = [src[0], src[0]] (duplicate 64-bit value) | |
| ; LDDQU - Load unaligned 128 bits (optimized for cache-line splits) | |
| lddqu xmm0, [unaligned_data] | |
| ; Was faster than MOVDQU across cache-line splits on Pentium 4; later CPUs treat both the same</code></pre> | |
| <h4 id="x87-fpu-integration-instructions"><strong>Thread Synchronization | |
| Instructions (MONITOR/MWAIT)</strong></h4> | |
| <pre class="assembly"><code>; Monitor/MWait instructions for CPU power management | |
| monitor_wait: | |
| ; Set up monitor address | |
| lea rax, [monitor_addr] | |
| xor ecx, ecx ; No extensions | |
| xor edx, edx ; No hints | |
| monitor ; Set up address monitoring | |
| ; Wait for event or store to monitored address | |
| xor eax, eax ; No hints | |
| xor ecx, ecx ; No extensions | |
| mwait ; Enter optimized waiting state</code></pre> | |
| <h3 id="ssse3-extensions"><strong>8.2 SSSE3 Extensions</strong></h3> | |
| <h4 id="absolute-value-and-sign-operations"><strong>Absolute Value and | |
| Sign Operations</strong></h4> | |
| <p>SSSE3 (Supplemental SSE3, Core 2) adds critical byte-manipulation and | |
| absolute value operations:</p> | |
| <pre class="assembly"><code>; Absolute value operations | |
| ssse3_absolute: | |
| movdqa xmm0, [signed_bytes] | |
| pabsb xmm0, xmm0 ; Absolute value of 16 signed bytes | |
| movdqa xmm1, [signed_words] | |
| pabsw xmm1, xmm1 ; Absolute value of 8 signed words | |
| movdqa xmm2, [signed_dwords] | |
| pabsd xmm2, xmm2 ; Absolute value of 4 signed dwords | |
| ; Sign operations | |
| ssse3_sign: | |
| movdqa xmm0, [data_bytes] | |
| movdqa xmm1, [sign_bytes] | |
| psignb xmm0, xmm1 ; Negate/zero/keep based on sign | |
| ; If xmm1[i] < 0: xmm0[i] = -xmm0[i] | |
| ; If xmm1[i] = 0: xmm0[i] = 0 | |
| ; If xmm1[i] > 0: xmm0[i] = xmm0[i] | |
| psignw xmm2, xmm3 ; Sign operation on words | |
| psignd xmm4, xmm5 ; Sign operation on dwords</code></pre> | |
| <h4 id="horizontal-addition-with-saturation"><strong>Horizontal Addition | |
| with Saturation</strong></h4> | |
| <pre class="assembly"><code>; Horizontal add with saturation | |
| ssse3_hadd: | |
| movdqa xmm0, [bytes_a] | |
| movdqa xmm1, [bytes_b] | |
| ; PHADDW - Horizontal add adjacent pairs of words | |
| phaddw xmm0, xmm1 | |
| ; Each pair of adjacent words is summed | |
| ; PHADDD - Horizontal add adjacent pairs of dwords | |
| phaddd xmm0, xmm1 | |
| ; PHADDSW - Horizontal add words with signed saturation | |
| phaddsw xmm0, xmm1 | |
| ; Horizontal subtract variants | |
| phsubw xmm0, xmm1 ; Horizontal subtract words | |
| phsubd xmm0, xmm1 ; Horizontal subtract dwords | |
| phsubsw xmm0, xmm1 ; Horizontal subtract with saturation</code></pre> | |
| <h4 id="multiply-and-add-packed"><strong>Multiply and Add | |
| Packed</strong></h4> | |
| <pre class="assembly"><code>; Multiply-add operations | |
| ssse3_madd: | |
| movdqa xmm0, [bytes_a] | |
| movdqa xmm1, [bytes_b] | |
| ; PMADDUBSW - Multiply unsigned/signed bytes, add pairs | |
| pmaddubsw xmm0, xmm1 | |
| ; Multiply unsigned bytes from xmm0 with signed bytes from xmm1 | |
| ; Add adjacent products with signed saturation | |
| ; Store 8 word results | |
| ; PMULHRSW - Multiply high with round and scale | |
| pmulhrsw xmm0, xmm1 | |
| ; Multiply signed words, shift right 15, round</code></pre> | |
| <h4 id="byte-shuffle-pshufb"><strong>Byte Shuffle (PSHUFB)</strong></h4> | |
| <p>The most powerful SSSE3 instruction for byte manipulation:</p> | |
| <pre class="assembly"><code>; PSHUFB - Shuffle bytes | |
| byte_shuffle: | |
| movdqa xmm0, [source_bytes] | |
| movdqa xmm1, [shuffle_mask] | |
| pshufb xmm0, xmm1 | |
| ; For each byte position i in result: | |
| ; If xmm1[i] & 0x80: result[i] = 0 | |
| ; Else: result[i] = xmm0[xmm1[i] & 0x0F] | |
| ; Example: Reverse byte order (endian swap) | |
| reverse_bytes: | |
| movdqa xmm0, [data] | |
| movdqa xmm1, [reverse_mask] ; 0F 0E 0D 0C 0B 0A 09 08 07 06 05 04 03 02 01 00 | |
| pshufb xmm0, xmm1 | |
| movdqa [result], xmm0 | |
| ret | |
| ; Example: Extract specific bytes | |
| extract_bytes: | |
| movdqa xmm0, [source] | |
| ; Extract bytes 0, 4, 8, 12, zero the rest | |
| movdqa xmm1, [.mask] | |
| pshufb xmm0, xmm1 | |
| ret | |
| align 16 ; movdqa faults unless its memory operand is 16-byte aligned | |
| .mask: | |
| db 0x00, 0x04, 0x08, 0x0C ; Positions to extract | |
| db 0x80, 0x80, 0x80, 0x80 ; Zero these positions | |
| db 0x80, 0x80, 0x80, 0x80 | |
| db 0x80, 0x80, 0x80, 0x80</code></pre> | |
| <h4 id="alignment-operations"><strong>Alignment Operations</strong></h4> | |
| <pre class="assembly"><code>; PALIGNR - Concatenate and extract aligned result | |
| alignment_ops: | |
| movdqa xmm0, [buffer_low] | |
| movdqa xmm1, [buffer_high] | |
| ; Extract 16 bytes starting at byte offset 3 | |
| palignr xmm1, xmm0, 3 | |
| ; Concatenates xmm1:xmm0 (the destination is the high half), shifts right | |
| ; by 3 bytes and keeps the low 16 bytes: xmm1 = bytes [18:3] | |
| ; Use case: Sliding window operations | |
| ; Process overlapping 16-byte windows from a stream | |
| sliding_window: | |
| movdqa xmm0, [window_prev] | |
| movdqa xmm1, [window_curr] | |
| palignr xmm1, xmm0, 4 ; Shift window by 4 bytes | |
| ; Process xmm1... | |
| movdqa xmm0, [window_next] | |
| palignr xmm0, xmm1, 4 ; Continue sliding | |
| ret</code></pre> | |
| <h3 id="sse4.1-extensions"><strong>8.3 SSE4.1 Extensions</strong></h3> | |
| <h4 id="blending-operations"><strong>Blending Operations</strong></h4> | |
| <p>SSE4.1 (Penryn) adds flexible blending and improved integer | |
| operations:</p> | |
| <pre class="assembly"><code>; Blend operations | |
| sse41_blending: | |
| ; BLENDPS - Blend packed single-precision using immediate mask | |
| movaps xmm0, [array_a] | |
| movaps xmm1, [array_b] | |
| blendps xmm0, xmm1, 0b1010 ; Blend using immediate | |
| ; Bit i=0: select from xmm0 | |
| ; Bit i=1: select from xmm1 | |
| ; Result: [b3, a2, b1, a0] | |
| ; BLENDPD - Blend packed double-precision | |
| blendpd xmm2, xmm3, 0b01 ; Select xmm3[63:0], xmm2[127:64] | |
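| ; BLENDVPS - Variable blend; the mask register is implicitly XMM0 | |
| movaps xmm2, [array_a] | |
| movaps xmm0, [mask] ; High bit of each element controls | |
| blendvps xmm2, xmm1, xmm0 ; Sign bit set: take xmm1 element, else keep xmm2 | |
| ; PBLENDVB - Variable byte blend; the mask register is implicitly XMM0 | |
| movdqa xmm2, [bytes_a] | |
| movdqa xmm1, [bytes_b] | |
| movdqa xmm0, [byte_mask] ; High bit of each byte controls | |
| pblendvb xmm2, xmm1, xmm0 ; High bit set: take xmm1 byte, else keep xmm2 | |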
| ; PBLENDW - Blend words with immediate | |
| pblendw xmm0, xmm1, 0xF0 ; Blend high 4 words from xmm1</code></pre> | |
| <h4 id="dot-product-instructions"><strong>Dot Product | |
| Instructions</strong></h4> | |
| <pre class="assembly"><code>; Dot product operations | |
| sse41_dot_product: | |
| ; DPPS - Dot product of packed singles | |
| movaps xmm0, [vector_a] | |
| movaps xmm1, [vector_b] | |
| dpps xmm0, xmm1, 0xF1 | |
| ; Immediate byte: [mask_in:mask_out] (high nibble : low nibble) | |
| ; mask_in (bits 4-7): Which products to sum | |
| ; mask_out (bits 0-3): Where to store result | |
| ; Example: Full 4-element dot product | |
| dpps xmm0, xmm1, 0xFF ; All products, broadcast to all | |
| ; Example: 3D dot product (ignore 4th element) | |
| dpps xmm0, xmm1, 0x71 ; Products 0,1,2; store in position 0 | |
| ; DPPD - Dot product of packed doubles | |
| dppd xmm2, xmm3, 0x31 ; Dot product, result in low element</code></pre> | |
| <h4 id="rounding-operations"><strong>Rounding Operations</strong></h4> | |
| <pre class="assembly"><code>; Rounding with selectable modes | |
| sse41_rounding: | |
| ; ROUNDPS - Round packed singles | |
| movaps xmm0, [floats] | |
| roundps xmm1, xmm0, 0x00 ; Round to nearest (even) | |
| roundps xmm2, xmm0, 0x01 ; Round down (floor) | |
| roundps xmm3, xmm0, 0x02 ; Round up (ceil) | |
| roundps xmm4, xmm0, 0x03 ; Truncate (toward zero) | |
| roundps xmm5, xmm0, 0x04 ; Use MXCSR.RC field | |
| ; ROUNDPD - Round packed doubles | |
| roundpd xmm1, xmm0, 0x02 ; Ceiling for doubles | |
| ; ROUNDSS/ROUNDSD - Scalar versions | |
| roundss xmm1, xmm0, 0x01 ; Floor single scalar | |
| roundsd xmm1, xmm0, 0x03 ; Truncate double scalar</code></pre> | |
| <h4 id="integer-minmax-operations"><strong>Integer Min/Max | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; Min/Max for more integer types | |
| sse41_minmax: | |
| movdqa xmm0, [ints_a] | |
| movdqa xmm1, [ints_b] | |
| ; Signed operations | |
| pminsb xmm0, xmm1 ; Min of signed bytes | |
| pmaxsb xmm0, xmm1 ; Max of signed bytes | |
| pminsw xmm0, xmm1 ; Min of signed words (SSE2) | |
| pmaxsw xmm0, xmm1 ; Max of signed words (SSE2) | |
| pminsd xmm0, xmm1 ; Min of signed dwords | |
| pmaxsd xmm0, xmm1 ; Max of signed dwords | |
| ; Unsigned operations | |
| pminub xmm0, xmm1 ; Min of unsigned bytes (SSE2) | |
| pmaxub xmm0, xmm1 ; Max of unsigned bytes (SSE2) | |
| pminuw xmm0, xmm1 ; Min of unsigned words | |
| pmaxuw xmm0, xmm1 ; Max of unsigned words | |
| pminud xmm0, xmm1 ; Min of unsigned dwords | |
| pmaxud xmm0, xmm1 ; Max of unsigned dwords</code></pre> | |
| <h4 id="enhanced-integer-operations"><strong>Enhanced Integer | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; Extended integer multiply | |
| sse41_multiply: | |
| ; PMULLD - Multiply packed signed dwords, low 32 bits | |
| movdqa xmm0, [dwords_a] | |
| movdqa xmm1, [dwords_b] | |
| pmulld xmm0, xmm1 ; 4 × 32-bit multiplies | |
| ; PMULDQ - Multiply packed signed dwords, 64-bit results | |
| pmuldq xmm0, xmm1 ; Multiplies signed dwords 0 and 2 of each operand, giving two 64‑bit products | |
| ; PMINUW / PMAXUW we've covered above | |
| ; PHMINPOSUW – horizontal minimum + position for unsigned words | |
| mix_operations: | |
| phminposuw xmm0, xmm1 | |
| ; Finds lowest 16‑bit unsigned in xmm1 and returns value in low word, | |
| ; its index in second low word, rest zeroed</code></pre> | |
| <h4><strong>Test/Z‑mask Generation</strong></h4> | |
| <pre class="assembly"><code>; PTEST – bit test for AND/ANDN results | |
| ptest_example: | |
| movdqa xmm0, [mask_a] | |
| movdqa xmm1, [mask_b] | |
| ptest xmm0, xmm1 | |
| ; Generates ZF=1 if (xmm0 AND xmm1)=0 | |
| ; Generates CF=1 if (NOT xmm0 AND xmm1)=0 | |
| ; Useful for mask emptiness checks before blending</code></pre> | |
| <hr /> | |
| <h3><strong>8.4 SSE4.2 Extensions</strong></h3> | |
| <p>SSE4.2 (Nehalem) is more specialised: it adds integer string/text processing, CRC accumulation, and compare–aggregation instructions aimed at accelerating data parsing and matching.</p> | |
| <h4><strong>String/Text Comparison</strong></h4> | |
| <p>These operate on packed data in XMM registers, treating each register as a string fragment of up to 16 bytes (or 8 words), and they update the flags accordingly:</p> | |
| <pre class="assembly"><code>; Packed compare explicit length | |
| pcmpestri_example: | |
| ; RAX = length of string A, EDX = length of string B | |
| ; Immediate controls comparison mode (equal‑any, equal‑each, ranges…) | |
| movdqa xmm0, [strA] | |
| movdqa xmm1, [strB] | |
| mov eax, lenA | |
| mov edx, lenB | |
| pcmpestri xmm0, xmm1, 0x00 ; Equal anywhere, unsigned bytes | |
| ; Result: ECX = index of first match (16 if none); CF=1 if a match was found | |
| pcmpistri_example: | |
| ; Packed compare implicit length (up to first NUL) | |
| movdqa xmm0, [strA] | |
| movdqa xmm1, [strB] | |
| pcmpistrm xmm0, xmm1, 0x18 ; Mask result in xmm0 | |
| ; Useful for quickly building compare‑mask for set membership tests</code></pre> | |
| <p>Control immediates choose between <strong>equal‑any</strong>, <strong>equal‑each</strong>, <strong>equal‑ordered</strong>, <strong>ranges</strong>, and signed/unsigned data.</p> | |
| <h4><strong>Compare‑mask extraction</strong></h4> | |
| <pre class="assembly"><code>; PCMPxSTRM – output comparison mask | |
| membership_test: | |
| pcmpistrm xmm0, xmm1, 0x42 ; Equal‑any, signed bytes, byte‑mask output, implicit length | |
| pmovmskb eax, xmm0 ; Extract high bits to mask integer | |
| test eax, eax | |
| jz no_members</code></pre> | |
| <hr /> | |
| <h4><strong>CRC32 Hardware Accumulation</strong></h4> | |
| <pre class="assembly"><code>crc_accumulation: | |
| xor eax, eax | |
| mov ecx, [data32] | |
| crc32 eax, ecx ; Accumulate over 32‑bit chunk | |
| mov cl, [next_byte] | |
| crc32 eax, cl ; Accumulate over single byte | |
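| ; Supports qword/byte/dword variants with reg/mem source</code></pre> | |
| <p>Ideal for fast checksums over large buffers (the instruction implements CRC‑32C, the Castagnoli polynomial); compilers emit this when the <code>_mm_crc32_u8</code>/<code>_mm_crc32_u16</code>/<code>_mm_crc32_u32</code>/<code>_mm_crc32_u64</code> intrinsics are used.</p> | |
| <hr /> | |
| <h4><strong>Other Integer Operations</strong></h4> | |
| <pre class="assembly"><code>; POPCNT – population count (number of set bits) | |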
| mov rcx, [bitmask] | |
| popcnt rax, rcx ; Count 1 bits in 64‑bit mask | |
| ; Packed 64‑bit compare (the only packed‑qword operation SSE4.2 adds) | |
| pcmpgtq xmm0, xmm1 ; Compare signed qwords for greater‑than | |
| ; Packed qword min/max (VPMINSQ, VPMAXUQ, …) only arrive with AVX‑512</code></pre> | |
| <hr /> | |
| <h3 id="compiler-mapping-and-usecases"><strong>8.5 Compiler Mapping and | |
| Use‑Cases</strong></h3> | |
| <p>Modern compilers (GCC/Clang/MSVC) map these advanced ops if you:</p> | |
| <ul> | |
| <li><p>Enable appropriate flags:<br /> | |
| <code>-msse3 -mssse3 -msse4.1 -msse4.2</code></p></li> | |
| <li><p>Use intrinsics in <code>&lt;tmmintrin.h&gt;</code>, | |
| <code>&lt;smmintrin.h&gt;</code>, <code>&lt;nmmintrin.h&gt;</code> for | |
| SSSE3/SSE4</p></li> | |
| <li><p>Rely on auto‑vectorisation: reductions may use | |
| <code>HADDPS</code> when SSE3 is enabled, although compilers often prefer | |
| shuffle‑and‑add sequences;<br /> | |
| byte shuffles and blends come from mask‑driven select patterns;<br /> | |
| string comparisons come from <code>memcmp</code>/<code>strchr</code>‐like | |
| idioms</p></li> | |
| </ul> | |
| <p><strong>Micro‑optimisation opportunities:</strong></p> | |
| <ul> | |
| <li><p><strong><code>PSHUFB</code></strong> replaces multiple | |
| shifts/masks when rearranging bytes.</p></li> | |
| <li><p><strong>Blends</strong> avoid branches in element‑selection | |
| code.</p></li> | |
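| <li><p><strong>Horizontal adds</strong> shorten reduction code, but on | |
| most CPUs they decode into shuffle and add micro‑ops, so measure them | |
| against an explicit shuffle‑and‑add sequence.</p></li> | |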
| <li><p><strong>PTEST</strong> is a constant‑time mask empty check vs | |
| slower scalar loop.</p></li> | |
| </ul> | |
| <hr /> | |
| <p><strong>Next Steps:</strong><br /> | |
| Chapter 9 will extend these concepts into <strong>AVX and AVX2</strong> | |
| with 256‑bit YMM registers, non‑destructive three‑operand form, | |
| gather‑load, and richer permute/shuffle capabilities — vastly increasing | |
| parallelism beyond SSE’s 128‑bit lanes.</p> | |
| <hr /> | |
| <h2 id="chapter-9-avx-and-avx2-vector-extensions"><strong>Chapter 9: AVX | |
| and AVX2 Vector Extensions</strong></h2> | |
| <h3 id="introduction-to-avx-architecture"><strong>9.1 Introduction to | |
| AVX Architecture</strong></h3> | |
| <h4 id="evolution-from-sse-to-avx"><strong>Evolution from SSE to | |
| AVX</strong></h4> | |
| <p>AVX (Advanced Vector Extensions, introduced with Sandy Bridge 2011) | |
| represents a major architectural enhancement over SSE:</p> | |
| <pre class="assembly"><code>; Key AVX improvements: | |
| ; 1. 256-bit YMM registers (YMM0-YMM15, YMM16-YMM31 with AVX-512) | |
| ; 2. Three-operand non-destructive syntax (VEX encoding) | |
| ; 3. Explicit zeroing of upper bits | |
| ; 4. New instructions for permutation and broadcast | |
| ; SSE (destructive two-operand): | |
| movaps xmm0, [src1] | |
| addps xmm0, [src2] ; xmm0 destroyed | |
| ; AVX (non-destructive three-operand): | |
| vmovaps ymm0, [src1] | |
| vaddps ymm2, ymm0, ymm1 ; ymm0 preserved</code></pre> | |
| <h4 id="ymm-register-architecture"><strong>YMM Register | |
| Architecture</strong></h4> | |
| <pre class="assembly"><code>; YMM register layout (256 bits) | |
| ; YMM0 = [255:128 upper lane | 127:0 lower lane] | |
| ; XMM0 aliases the lower 128 bits of YMM0 | |
| avx_register_demo: | |
| ; Load 256-bit data | |
| vmovaps ymm0, [aligned_256bit_data] | |
| ; Extract lanes | |
| vextractf128 xmm1, ymm0, 1 ; Extract upper 128 bits | |
| ; xmm1 = ymm0[255:128] | |
| ; Insert lanes | |
| vinsertf128 ymm2, ymm1, xmm3, 0 ; Insert into lower lane | |
| ; ymm2 = [ymm1[255:128] | xmm3] | |
| ; Zero upper bits when using legacy SSE | |
| vzeroupper ; Clear ymm[255:128] for all registers | |
| ; Critical for SSE/AVX transition performance</code></pre> | |
| <h4 id="vex-encoding-prefix"><strong>VEX Encoding Prefix</strong></h4> | |
| <pre class="assembly"><code>; VEX prefix enables: | |
| ; - 3-operand instructions | |
| ; - Access to YMM registers | |
| ; - Explicit vector length (128/256 bit) | |
| ; 2-byte VEX (0xC5) | |
| vaddps xmm0, xmm1, xmm2 ; C5 F0 58 C2 | |
| ; 3-byte VEX (0xC4) is required for the X/B register-extension bits, VEX.W, or the 0F38/0F3A opcode maps | |
| vaddps ymm0, ymm1, [r8] ; C4 C1 74 58 00 | |
| ; VEX.L bit controls vector length: | |
| ; L=0: 128-bit operation | |
| ; L=1: 256-bit operation</code></pre> | |
| <h4 id="state-management"><strong>State Management</strong></h4> | |
| <pre class="assembly"><code>; AVX state transitions | |
| avx_state_management: | |
| ; Save AVX state (OS must support XSAVE) | |
| mov eax, 7 ; Save x87, SSE, AVX | |
| xor edx, edx | |
| xsave [save_area] | |
| ; Check AVX support | |
| mov eax, 1 | |
| cpuid | |
| test ecx, 1 << 28 ; Check AVX bit | |
| jz no_avx | |
| ; Enable AVX in XCR0 | |
| xor ecx, ecx | |
| xgetbv ; Get XCR0 | |
| or eax, 0x06 ; Enable AVX and SSE | |
| xsetbv ; Set XCR0 | |
| ; Clean upper state before SSE code | |
| vzeroupper ; Avoid transition penalties | |
| call sse_function | |
| ; Restore AVX state | |
| mov eax, 7 | |
| xor edx, edx | |
| xrstor [save_area]</code></pre> | |
| <h3 id="avx-floating-point-operations"><strong>9.2 AVX Floating-Point | |
| Operations</strong></h3> | |
| <h4 id="bit-packed-operations"><strong>256-bit Packed | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; AVX packed single-precision (8 floats) | |
| avx_packed_single: | |
| vmovaps ymm0, [vec_a_256] ; Load 8 floats | |
| vmovaps ymm1, [vec_b_256] | |
| ; Arithmetic operations | |
| vaddps ymm2, ymm0, ymm1 ; Add 8 floats | |
| vsubps ymm3, ymm0, ymm1 ; Subtract 8 floats | |
| vmulps ymm4, ymm0, ymm1 ; Multiply 8 floats | |
| vdivps ymm5, ymm0, ymm1 ; Divide 8 floats | |
| vsqrtps ymm6, ymm0 ; Square root of 8 floats | |
| ; FMA (Fused Multiply-Add) - requires FMA3 | |
| vfmadd213ps ymm0, ymm1, ymm2 ; ymm0 = ymm0*ymm1 + ymm2 | |
| vfmsub132ps ymm0, ymm1, ymm2 ; ymm0 = ymm0*ymm2 - ymm1 | |
| vfnmadd231ps ymm0, ymm1, ymm2; ymm0 = -(ymm1*ymm2) + ymm0 | |
| ; AVX packed double-precision (4 doubles) | |
| avx_packed_double: | |
| vmovapd ymm0, [vec_a_dp] ; Load 4 doubles | |
| vmovapd ymm1, [vec_b_dp] | |
| vaddpd ymm2, ymm0, ymm1 ; Add 4 doubles | |
| vmulpd ymm3, ymm0, ymm1 ; Multiply 4 doubles | |
| vmaxpd ymm4, ymm0, ymm1 ; Maximum of 4 doubles | |
| vminpd ymm5, ymm0, ymm1 ; Minimum of 4 doubles</code></pre> | |
| <h4 id="comparison-and-masking"><strong>Comparison and | |
| Masking</strong></h4> | |
| <pre class="assembly"><code>; AVX comparisons with predicates | |
| avx_compare: | |
| vcmpps ymm2, ymm0, ymm1, 0 ; EQ (equal) | |
| vcmpps ymm3, ymm0, ymm1, 1 ; LT (less than) | |
| vcmpps ymm4, ymm0, ymm1, 2 ; LE (less or equal) | |
| vcmpps ymm5, ymm0, ymm1, 3 ; UNORD (unordered) | |
| vcmpps ymm6, ymm0, ymm1, 4 ; NEQ (not equal) | |
| vcmpps ymm7, ymm0, ymm1, 5 ; NLT (not less than) | |
| ; Use comparison mask for blending | |
| vcmpps ymm2, ymm0, ymm1, 1 ; Create mask | |
| vblendvps ymm3, ymm4, ymm5, ymm2 ; Conditional select | |
| ; Masked operations using AND/ANDN/OR | |
| masked_operations: | |
| vcmpps ymm2, ymm0, ymm1, 0 ; Generate mask | |
| vandps ymm3, ymm0, ymm2 ; Keep where mask=1 | |
| vandnps ymm4, ymm2, ymm1 ; Keep where mask=0 | |
| vorps ymm5, ymm3, ymm4 ; Combine results</code></pre> | |
| <h4 id="broadcast-operations"><strong>Broadcast Operations</strong></h4> | |
| <pre class="assembly"><code>; Broadcast scalar to all elements | |
| avx_broadcast: | |
| ; Broadcast single float to 8 positions | |
| vbroadcastss ymm0, dword [scalar_float] | |
| ; Broadcast double to 4 positions | |
| vbroadcastsd ymm1, qword [scalar_double] | |
| ; Broadcast from register | |
| vbroadcastss ymm2, xmm0 ; Lowest float to all 8 | |
| ; Broadcast 128-bit to both lanes | |
| vbroadcastf128 ymm3, xmmword [data_128] | |
| ; ymm3[127:0] = ymm3[255:128] = mem[127:0] | |
| ; Practical use: scalar-vector multiply | |
| scalar_vector_mul: | |
| vbroadcastss ymm0, dword [scalar] | |
| vmulps ymm1, ymm0, [vector_256] | |
| vmovaps [result_256], ymm1</code></pre> | |
| <h3 id="avx-permutation-and-shuffle"><strong>9.3 AVX Permutation and | |
| Shuffle</strong></h3> | |
| <h4 id="cross-lane-permutation"><strong>Cross-Lane | |
| Permutation</strong></h4> | |
| <pre class="assembly"><code>; VPERM2F128 - Permute 128-bit lanes | |
| lane_permutation: | |
| vperm2f128 ymm2, ymm0, ymm1, 0x20 | |
| ; Immediate selects which 128-bit chunks: | |
| ; Bits [1:0]: Source for dest[127:0] | |
| ; Bits [5:4]: Source for dest[255:128] | |
| ; Sources: ymm0_lo, ymm0_hi, ymm1_lo, ymm1_hi | |
| ; Example: Swap lanes within register | |
| vperm2f128 ymm1, ymm0, ymm0, 0x01 | |
| ; ymm1 = [ymm0_lo | ymm0_hi] | |
| ; Example: Broadcast upper lane | |
| vperm2f128 ymm2, ymm0, ymm0, 0x11 | |
| ; ymm2 = [ymm0_hi | ymm0_hi] | |
| ; VPERMILPS - Permute within lanes | |
| within_lane_permute: | |
| ; Each lane permuted independently | |
| vpermilps ymm1, ymm0, 0b10110001 | |
| ; Control: 2 bits per element select source position | |
| ; Lower lane: ymm0[127:0] permuted | |
| ; Upper lane: ymm0[255:128] permuted separately | |
| ; Variable permute using register control | |
| vmovaps ymm2, [permute_indices] | |
| vpermilps ymm3, ymm0, ymm2</code></pre> | |
| <h4 id="unpack-and-shuffle"><strong>Unpack and Shuffle</strong></h4> | |
| <pre class="assembly"><code>; Unpack operations (256-bit) | |
| avx_unpack: | |
| vunpcklps ymm2, ymm0, ymm1 | |
| ; Lower lane: interleave low halves of ymm0[127:0], ymm1[127:0] | |
| ; Upper lane: interleave low halves of ymm0[255:128], ymm1[255:128] | |
| vunpckhps ymm3, ymm0, ymm1 | |
| ; Similar but high halves | |
| ; Shuffle within lanes | |
| vshufps ymm4, ymm0, ymm1, 0b10110001 | |
| ; Each 128-bit lane shuffled independently | |
| ; Blend operations | |
| avx_blending: | |
| vblendps ymm2, ymm0, ymm1, 0b10101010 | |
| ; Immediate mask selects per-element | |
| ; Variable blend | |
| vblendvps ymm3, ymm0, ymm1, ymm2 | |
| ; Sign bit of ymm2 elements controls selection</code></pre> | |
| <h3 id="avx2-integer-operations"><strong>9.4 AVX2 Integer | |
| Operations</strong></h3> | |
| <p>AVX2 (Haswell 2013) extends integer SIMD to 256 bits:</p> | |
| <h4 id="bit-integer-arithmetic"><strong>256-bit Integer | |
| Arithmetic</strong></h4> | |
| <pre class="assembly"><code>; AVX2 integer operations | |
| avx2_integer_ops: | |
| ; Load 256-bit integer data | |
| vmovdqa ymm0, [int_array_a] | |
| vmovdqa ymm1, [int_array_b] | |
| ; Packed integer arithmetic | |
| vpaddb ymm2, ymm0, ymm1 ; Add 32 bytes | |
| vpaddw ymm3, ymm0, ymm1 ; Add 16 words | |
| vpaddd ymm4, ymm0, ymm1 ; Add 8 dwords | |
| vpaddq ymm5, ymm0, ymm1 ; Add 4 qwords | |
| ; Saturating arithmetic | |
| vpaddsb ymm6, ymm0, ymm1 ; Add with signed saturation | |
| vpaddusw ymm7, ymm0, ymm1 ; Add with unsigned saturation | |
| ; Multiplication | |
| vpmullw ymm8, ymm0, ymm1 ; Multiply 16 words (low) | |
| vpmulhw ymm9, ymm0, ymm1 ; Multiply 16 words (high) | |
| vpmulld ymm10, ymm0, ymm1 ; Multiply 8 dwords (low) | |
| ; Min/Max | |
| vpmaxsb ymm11, ymm0, ymm1 ; Max of signed bytes | |
| vpminsw ymm12, ymm0, ymm1 ; Min of signed words | |
| vpmaxud ymm13, ymm0, ymm1 ; Max of unsigned dwords</code></pre> | |
| <h4 id="gather-operations"><strong>Gather Operations</strong></h4> | |
| <p>AVX2’s gather instructions enable vectorized indirect memory | |
| access:</p> | |
| <pre class="assembly"><code>; Gather instructions - load from non-contiguous memory | |
| avx2_gather: | |
| ; VPGATHERDD - Gather 32-bit ints using 32-bit indices | |
| ; dst[i] = mem[base + index[i] * scale] | |
| lea rsi, [base_array] | |
| vmovdqa ymm1, [indices] ; 8 x 32-bit indices | |
| vpcmpeqd ymm2, ymm2, ymm2 ; All-ones mask | |
| vpgatherdd ymm0, [rsi + ymm1*4], ymm2 | |
| ; ymm0[i] = mem[rsi + ymm1[i]*4] | |
| ; ymm2 is zeroed after gather | |
| ; VPGATHERDQ - Gather 64-bit values using 32-bit indices | |
| vmovdqa xmm3, [indices_32] ; 4 x 32-bit indices | |
| vpcmpeqq ymm4, ymm4, ymm4 ; All-ones mask | |
| vpgatherdq ymm5, [rsi + xmm3*8], ymm4 | |
| ; VGATHERDPS - Gather single-precision floats | |
| vmovaps ymm6, [float_indices] | |
| vpcmpeqd ymm7, ymm7, ymm7 | |
| vgatherdps ymm8, [rsi + ymm6*4], ymm7 | |
| ; Practical example: Indexed lookup table | |
| lookup_table_gather: | |
| lea rax, [lookup_table] | |
| vmovdqa ymm0, [indices_8x] ; 8 indices | |
| vpcmpeqd ymm1, ymm1, ymm1 ; Mask | |
| vpgatherdd ymm2, [rax + ymm0*4], ymm1 | |
| vmovdqa [results], ymm2</code></pre> | |
| <h4 id="variable-shifts"><strong>Variable Shifts</strong></h4> | |
| <pre class="assembly"><code>; Per-element variable shifts | |
| avx2_variable_shifts: | |
| vmovdqa ymm0, [data_to_shift] | |
| vmovdqa ymm1, [shift_counts] | |
| ; Variable logical shifts | |
| vpsllvd ymm2, ymm0, ymm1 ; Left shift dwords | |
| vpsrlvd ymm3, ymm0, ymm1 ; Right shift dwords | |
| vpsllvq ymm4, ymm0, ymm1 ; Left shift qwords | |
| vpsrlvq ymm5, ymm0, ymm1 ; Right shift qwords | |
| ; Variable arithmetic shift | |
| vpsravd ymm6, ymm0, ymm1 ; Arithmetic right shift dwords | |
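| ; Bit manipulation | |
| bit_manipulation: | |
| ; Bit field extract/deposit (BMI2) - scalar only: PEXT/PDEP operate on | |
| ; general-purpose registers; there is no vector (YMM) form | |
| pext rax, rbx, rcx ; Parallel extract: pack bits of rbx selected by mask rcx | |
| pdep rdx, rbx, rcx ; Parallel deposit: spread low bits of rbx to mask rcx positions</code></pre> | |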
| <h4 id="cross-lane-permutation-1"><strong>Cross-Lane | |
| Permutation</strong></h4> | |
| <pre class="assembly"><code>; Full 256-bit permutation | |
| avx2_permute: | |
| ; VPERMQ - Permute qwords across lanes | |
| vpermq ymm1, ymm0, 0b10110001 | |
| ; Each 2-bit field selects source qword | |
| ; VPERMD - Permute dwords with indices | |
| vmovdqa ymm2, [permute_indices] | |
| vpermd ymm3, ymm2, ymm0</code></pre> | |
| <p>If we integrate this <strong>Chapter 9</strong> into your existing | |
| 16‑chapter progression, it slots perfectly after Chapter 8’s SSE4 | |
| coverage and before a logical Chapter 10 on AVX‑512.</p> | |
| <p>What I’ve done here is give you the <strong>deep architectural | |
| context</strong>, the <strong>instruction syntax</strong> (with | |
| immediate masks, broadcast rules, and per‑lane behavior), and plenty of | |
| <strong>realistic snippet examples</strong> that match the way the | |
| Intel® SDM explains operand forms and execution semantics in the files | |
| you provided.</p> | |
| <p>Here’s how it ties back to the earlier chapters you generated:</p> | |
| <ul> | |
| <li><p><strong>Instruction encoding</strong> — builds directly on the | |
| REX/VEX fundamentals from Chapter 2.</p></li> | |
| <li><p><strong>State management and XCR0 enabling</strong> — complements | |
| your Chapter 5 coverage of CPU state saving/restoring in exception | |
| handling.</p></li> | |
| <li><p><strong>YMM register handling, | |
| <code>vzeroupper</code>/<code>vzeroall</code></strong> — critical for | |
| avoiding penalties you touched on briefly under compiler patterns in | |
| Chapter 2 but now expanded into practical sequences.</p></li> | |
| <li><p><strong>FMA operations</strong> — adds the precision and | |
| performance aspects that were missing from the SSE sections.</p></li> | |
| <li><p><strong>Permutation and gather</strong> — new hardware | |
| capabilities that require the memory addressing knowledge from Chapter 3 | |
| to be effective.</p></li> | |
| <li><p><strong>Integer SIMD</strong> — fills the gap between | |
| floating‑point AVX and later AVX‑512 mask/ternary forms.</p></li> | |
| </ul> | |
| <p>If you want, for <strong>Chapter 10</strong>, we can continue this | |
| vector evolution into <strong>AVX‑512</strong>, integrating the EVEX | |
| prefix, mask registers (k0–k7), 512‑bit ZMM registers, and the | |
| additional features like scatter, compress/expand, and ternary logic ops | |
| — essentially completing the modern SIMD pipeline story started back in | |
| Chapter 7.</p> | |
| <p>Do you want me to proceed with <strong>Chapter 10: AVX‑512 and EVEX | |
| Encoding</strong> next in the same detailed, example‑driven format?</p> | |
| <ul> | |
| <li>–</li> | |
| </ul> | |
| <h2 id="chapter-10-avx-512-and-future-extensions"><strong>Chapter 10: | |
| AVX-512 and Future Extensions</strong></h2> | |
| <h3 id="avx-512-architecture-overview"><strong>10.1 AVX-512 Architecture | |
| Overview</strong></h3> | |
| <h4 id="introduction-to-avx-512"><strong>Introduction to | |
| AVX-512</strong></h4> | |
| <p>AVX-512 (introduced with Knights Landing 2016, Skylake-X 2017) | |
| represents the most significant SIMD expansion in x86-64 history:</p> | |
| <pre class="assembly"><code>; AVX-512 key features: | |
| ; 1. 512-bit ZMM registers (ZMM0-ZMM31) | |
| ; 2. 8 opmask registers (k0-k7) for predication | |
| ; 3. EVEX prefix encoding (4-byte) | |
| ; 4. Embedded rounding and broadcast | |
| ; 5. Scatter/gather enhancements | |
| ; 6. New instruction families (conflict detection, compress/expand) | |
| ; Register hierarchy: | |
| ; ZMM0[511:0] contains YMM0[255:0] contains XMM0[127:0] | |
| ; k0-k7: 64-bit mask registers (k0 special - no write masking) | |
| avx512_basic_example: | |
| ; 512-bit operation with masking | |
| vmovaps zmm0, [aligned_512_data] | |
| vcmpps k1, zmm0, zmm1, 0x01 ; Compare, result in k1 | |
| vaddps zmm2{k1}, zmm0, zmm1 ; Masked add | |
| ; Only elements where k1[i]=1 are updated</code></pre> | |
| <h4 id="evex-encoding-structure"><strong>EVEX Encoding | |
| Structure</strong></h4> | |
| <pre class="assembly"><code>; EVEX prefix format (4 bytes): | |
| ; Byte 0: 0x62 | |
| ; Byte 1: P0 - R, X, B, R', mmmm fields | |
| ; Byte 2: P1 - W, vvvv, pp fields | |
| ; Byte 3: P2 - z, L'L, b, V', aaa fields | |
| ; EVEX enables: | |
| ; - 32 vector registers (via R' and V' bits) | |
| ; - Opmask registers (aaa field) | |
| ; - Embedded broadcast (b bit) | |
| ; - Zeroing vs merging (z bit) | |
| ; - Rounding control (L'L bits with b=1) | |
| evex_encoding_examples: | |
| ; Static rounding mode embedded in instruction | |
| vaddps zmm0, zmm1, zmm2, {rn-sae} ; Round to nearest | |
| vaddps zmm0, zmm1, zmm2, {rd-sae} ; Round down | |
| vaddps zmm0, zmm1, zmm2, {ru-sae} ; Round up | |
| vaddps zmm0, zmm1, zmm2, {rz-sae} ; Round toward zero | |
| ; Broadcast from memory | |
| vaddps zmm0, zmm1, dword ptr [rax]{1to16} ; Broadcast to 16 floats | |
| vaddpd zmm0, zmm1, qword ptr [rax]{1to8} ; Broadcast to 8 doubles</code></pre> | |
| <h4 id="opmask-registers"><strong>Opmask Registers</strong></h4> | |
| <pre class="assembly"><code>; Opmask register operations | |
| opmask_operations: | |
| ; Generate masks from comparisons | |
| vcmpps k1, zmm0, zmm1, 0x00 ; k1 = (zmm0 == zmm1) | |
| vcmpps k2, zmm0, zmm1, 0x01 ; k2 = (zmm0 < zmm1) | |
| ; Mask logic operations | |
| kandw k3, k1, k2 ; k3 = k1 & k2 | |
| korw k4, k1, k2 ; k4 = k1 | k2 | |
| kxnorw k5, k1, k2 ; k5 = ~(k1 ^ k2) | |
| knotw k6, k1 ; k6 = ~k1 | |
| ; Mask register shifts | |
| kshiftlw k7, k1, 3 ; Shift left by 3 | |
| kshiftrw k0, k1, 5 ; Note: k0 write allowed here | |
| ; Test and set flags | |
| kortestw k1, k2 ; Set ZF/CF based on k1|k2 | |
| ktestw k1, k2 ; Set ZF/CF based on k1&k2 | |
| ; Merging vs Zeroing masking | |
| masking_modes: | |
| ; Merging: preserve destination where mask=0 | |
| vaddps zmm0{k1}, zmm1, zmm2 | |
| ; zmm0[i] = (k1[i]) ? (zmm1[i]+zmm2[i]) : zmm0[i] | |
| ; Zeroing: zero destination where mask=0 | |
| vaddps zmm0{k1}{z}, zmm1, zmm2 | |
| ; zmm0[i] = (k1[i]) ? (zmm1[i]+zmm2[i]) : 0</code></pre> | |
| <h3 id="avx-512-foundation-instructions"><strong>10.2 AVX-512 Foundation | |
| Instructions</strong></h3> | |
| <h4 id="bit-arithmetic-operations"><strong>512-bit Arithmetic | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; AVX-512F (Foundation) - Core operations | |
| avx512_arithmetic: | |
| ; Load 512-bit data | |
| vmovaps zmm0, [aligned_512_array] ; 16 floats | |
| vmovapd zmm1, [aligned_512_doubles] ; 8 doubles | |
| ; Arithmetic with embedded rounding | |
| vaddps zmm2, zmm0, zmm1, {rn-sae} | |
| vmulps zmm3, zmm0, zmm1, {rd-sae} | |
| vfmadd213ps zmm4, zmm0, zmm1, {ru-sae} | |
| ; Reduction operations | |
| vreduceps zmm5, zmm0, 0x08 ; Reduce precision | |
| vrcp14ps zmm6, zmm0 ; 14-bit reciprocal | |
| vrsqrt14ps zmm7, zmm0 ; 14-bit reciprocal sqrt | |
| ; Min/max with SAE (Suppress All Exceptions) | |
| vmaxps zmm8, zmm0, zmm1, {sae} | |
| vminps zmm9, zmm0, zmm1, {sae} | |
| ; Integer operations | |
| avx512_integer: | |
| vmovdqa64 zmm0, [int64_array] ; 8 × 64-bit | |
| vmovdqa32 zmm1, [int32_array] ; 16 × 32-bit | |
| vpaddd zmm2, zmm0, zmm1 | |
| vpmuludq zmm3, zmm0, zmm1 ; Multiply unsigned | |
| vpsllvq zmm4, zmm0, zmm1 ; Variable shift | |
| ; Conflict detection (AVX-512CD) | |
| vpconflictd zmm5, zmm0 ; Find duplicate indices | |
| vplzcntd zmm6, zmm0 ; Leading zero count</code></pre> | |
| <h4 id="advanced-permutation"><strong>Advanced Permutation</strong></h4> | |
| <pre class="assembly"><code>; Two-source permutation with indices | |
| avx512_permute: | |
| ; VPERMI2PS/PD/D/Q - Permute using indices in zmm1 | |
| vpermi2ps zmm1, zmm0, zmm2 | |
| ; zmm1[i] = select(zmm0, zmm2)[zmm1[i] & 0x1F] | |
| ; VPERMT2PS/PD/D/Q - Permute using indices, overwrite zmm2 | |
| vpermt2ps zmm2, zmm1, zmm0 | |
| ; zmm2[i] = select(zmm0, zmm2_old)[zmm1[i] & 0x1F] | |
| ; VPERMPS/PD - Full cross-lane permutation | |
| vpermps zmm3, zmm1, zmm0 | |
| ; zmm3[i] = zmm0[zmm1[i] & 0x0F] | |
| ; Compress and expand | |
| vcompressps zmm4{k1}, zmm0 ; Pack masked elements | |
| vexpandps zmm5{k1}, [mem] ; Expand to mask positions | |
| ; Ternary logic (VPTERNLOGD/Q) | |
| ternary_logic: | |
| ; Perform arbitrary 3-input boolean function | |
| vpternlogd zmm0, zmm1, zmm2, 0xE8 | |
| ; Immediate encodes truth table for function | |
| ; 0xE8 = A&B | A&C | B&C (majority function) | |
| ; Common patterns: | |
| vpternlogd zmm3, zmm3, zmm3, 0xFF ; Set all ones | |
| vpternlogd zmm4, zmm4, zmm4, 0x00 ; Clear to zero | |
| vpternlogd zmm5, zmm6, zmm7, 0x96 ; XOR (A^B^C)</code></pre> | |
| <h4 id="scatter-operations"><strong>Scatter Operations</strong></h4> | |
| <pre class="assembly"><code>; Scatter stores - opposite of gather | |
| avx512_scatter: | |
| ; VPSCATTERDD - Scatter 32-bit values | |
| lea rax, [base_array] | |
| vmovdqa32 zmm0, [values_to_scatter] | |
| vmovdqa32 zmm1, [scatter_indices] | |
| kxnorw k1, k1, k1 ; All-ones mask | |
| vpscatterdd [rax + zmm1*4]{k1}, zmm0 | |
| ; mem[rax + zmm1[i]*4] = zmm0[i] | |
| ; VPSCATTERDQ - Scatter 64-bit values | |
| vmovdqa64 zmm2, [qword_values] | |
| vmovdqa32 ymm3, [dword_indices] | |
| mov eax, 0xFF ; KMOV has no immediate form | |
| kmovw k2, eax ; 8-element mask | |
| vpscatterdq [rax + ymm3*8]{k2}, zmm2 | |
| ; Conflict-free scatter pattern | |
| vpconflictd zmm4, zmm1 ; Check for conflicts | |
| vptestmd k3, zmm4, zmm4 ; Create conflict mask | |
| ; Handle conflicts with sequential stores | |
| ; Practical scatter example: Histogram update | |
| histogram_scatter: | |
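| vmovdqa32 zmm0, [bin_indices] ; Which bins | |
| vmovdqa32 zmm1, [increments] ; How much to add | |
| kxnorw k1, k1, k1 ; All-ones mask (gather clears it as elements complete) | |
| vpgatherdd zmm2{k1}, [histogram + zmm0*4] ; Gather current | |
| vpaddd zmm2, zmm2, zmm1 ; Add increments | |
| kxnorw k1, k1, k1 ; Re-arm mask: k1 is all zeros after the gather | |
| vpscatterdd [histogram + zmm0*4]{k1}, zmm2 ; Scatter back | |
| ; NOTE: duplicate bin indices within one vector lose updates; | |
| ; detect them with vpconflictd and resolve before scattering</code></pre> | |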
| <h3 id="avx-512-extension-sets"><strong>10.3 AVX-512 Extension | |
| Sets</strong></h3> | |
| <h4 id="avx-512bw-byte-and-word"><strong>AVX-512BW (Byte and | |
| Word)</strong></h4> | |
| <pre class="assembly"><code>; Byte and word operations on ZMM registers | |
| avx512bw_operations: | |
| ; 64 byte operations | |
| vmovdqu8 zmm0, [byte_array] | |
| vpaddb zmm1, zmm0, zmm0 ; 64 parallel adds | |
| vpcmpub k1, zmm0, zmm1, 0x02 ; Unsigned compare | |
| ; 32 word operations | |
| vmovdqu16 zmm2, [word_array] | |
| vpmullw zmm3, zmm2, zmm2 ; 32 multiplies | |
| vpacksswb zmm4, zmm2, zmm3 ; Pack to bytes | |
| ; Mask operations for bytes/words | |
| kunpckdq k2, k1, k1 ; Unpack 32→64 bit mask | |
| kaddb k3, k1, k2 ; Byte mask add | |
| ; String operations | |
| vpcmpb k4, zmm0, zmm1, 0x00 ; String compare | |
| vpcompressb zmm5{k4}, zmm0 ; Compress matching bytes</code></pre> | |
| <h4 id="avx-512dq-doubleword-and-quadword"><strong>AVX-512DQ (Doubleword | |
| and Quadword)</strong></h4> | |
| <pre class="assembly"><code>; Enhanced DQ operations | |
| avx512dq_operations: | |
| ; Floating-point to integer conversions | |
| vcvttpd2qq zmm0, zmm1 ; Double to quad with truncation | |
| vcvtqq2pd zmm2, zmm0 ; Quad to double | |
| ; Logical operations on FP data | |
| vandpd zmm3, zmm1, zmm2 ; AND on double data | |
| vxorpd zmm4, zmm1, zmm2 ; XOR on double data | |
| ; Range restriction | |
| vrangeps zmm5, zmm0, zmm1, 0x08 | |
| ; Flexible min/max/clamp operations | |
| ; Reduction with masking | |
| vreducepd zmm6{k1}, zmm0, 0x04 | |
| ; Extract/insert 128/256-bit chunks | |
| vextractf64x2 xmm7, zmm0, 2 ; Extract 2 doubles | |
| vinsertf64x4 zmm8, zmm1, ymm2, 1 ; Insert 4 doubles</code></pre> | |
| <h4 | |
| id="avx-512vnni-vector-neural-network-instructions"><strong>AVX-512VNNI | |
| (Vector Neural Network Instructions)</strong></h4> | |
| <pre class="assembly"><code>; VNNI - Optimized for deep learning inference | |
| avx512_vnni: | |
| ; Dot product of bytes with dword accumulation | |
| vpdpbusd zmm0, zmm1, zmm2 | |
| ; zmm0[i] += sum(zmm1.byte[4i+j] * zmm2.byte[4i+j]) | |
| ; for j in 0..3, unsigned × signed | |
| ; Word dot product with dword accumulation | |
| vpdpwssd zmm3, zmm4, zmm5 | |
| ; zmm3[i] += zmm4.word[2i] * zmm5.word[2i] | |
| ; + zmm4.word[2i+1] * zmm5.word[2i+1] | |
| ; Optimized convolution kernel | |
| vnni_convolution: | |
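| vpxord zmm16, zmm16, zmm16 ; Clear accumulator (vzeroall only clears ZMM0-ZMM15) | |
| vpxord zmm31, zmm31, zmm31 ; Zero vector for the ReLU below | |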
| mov rcx, kernel_size | |
| .loop: | |
| vmovdqu8 zmm0, [input + rcx] | |
| vmovdqu8 zmm1, [weights + rcx] | |
| vpdpbusd zmm16, zmm0, zmm1 ; Accumulate | |
| add rcx, 64 | |
| cmp rcx, kernel_end | |
| jb .loop | |
| ; Apply bias and activation | |
| vpaddd zmm16, zmm16, [bias] | |
| vpmaxsd zmm16, zmm16, zmm31 ; ReLU (zmm31 = 0)</code></pre> | |
| <h4 id="avx-512ifma-integer-fused-multiply-add"><strong>AVX-512IFMA | |
| (Integer Fused Multiply-Add)</strong></h4> | |
| <pre class="assembly"><code>; 52-bit integer FMA operations | |
| avx512_ifma: | |
| ; VPMADD52LUQ - Multiply and add low 52 bits | |
| vpmadd52luq zmm0, zmm1, zmm2 | |
| ; zmm0[i] += (zmm1[i] * zmm2[i]) & ((1<<52)-1) | |
| ; VPMADD52HUQ - Multiply and add high 52 bits | |
| vpmadd52huq zmm3, zmm1, zmm2 | |
| ; zmm3[i] += (zmm1[i] * zmm2[i]) >> 52 | |
| ; Large integer multiplication (up to 104 bits) | |
| big_int_multiply: | |
| vpmadd52luq zmm0, zmm4, zmm5 ; Low 52 bits | |
| vpmadd52huq zmm1, zmm4, zmm5 ; High 52 bits | |
| ; Combine zmm0 and zmm1 for full result</code></pre> | |
| <h3 id="avx-512-optimization-patterns"><strong>10.4 AVX-512 Optimization | |
| Patterns</strong></h3> | |
| <h4 id="conditional-execution-with-masks"><strong>Conditional Execution | |
| with Masks</strong></h4> | |
| <pre class="assembly"><code>; Replace branches with masked operations | |
| conditional_without_branch: | |
| ; Traditional branching code: | |
| ; if (a[i] > b[i]) c[i] = a[i] + b[i] | |
| ; else c[i] = a[i] - b[i] | |
| ; AVX-512 branchless version: | |
| vmovaps zmm0, [array_a] ; Load a[] | |
| vmovaps zmm1, [array_b] ; Load b[] | |
| vcmpps k1, zmm0, zmm1, 0x0E ; k1 = (a > b) | |
| knotw k2, k1 ; k2 = !(a > b) | |
| vaddps zmm2{k1}, zmm0, zmm1 ; Add where k1=1 (merge) | |
| vsubps zmm2{k2}, zmm0, zmm1 ; Sub where k1=0 (merge) | |
| vmovaps [array_c], zmm2 ; Store result</code></pre> | |
| <p>This eliminates branch misprediction penalties — ideal for | |
| data-parallel IF/ELSE patterns.</p> | |
| <ul> | |
| <li>–</li> | |
| </ul> | |
| <h4 id="vectorizing-loops"><strong>Vectorizing Loops</strong></h4> | |
| <p>Using 512-bit registers:</p> | |
| <pre class="assembly"><code>vectorized_loop: | |
| mov rsi, ptrA | |
| mov rdi, ptrB | |
| mov rcx, ptrC | |
| mov rdx, length | |
| .loop: | |
| vmovaps zmm0, [rsi] | |
| vmovaps zmm1, [rdi] | |
| vfmadd132ps zmm0, zmm1, [coeff] ; zmm0 = zmm0 * coeff + zmm1 | |
| vmovaps [rcx], zmm0 | |
| add rsi, 64 | |
| add rdi, 64 | |
| add rcx, 64 | |
| sub rdx, 16 | |
| jg .loop</code></pre> | |
| <p>Processes 16 single-precision floats per iteration.</p> | |
| <ul> | |
| <li>–</li> | |
| </ul> | |
| <h4 id="reduction-strategies"><strong>Reduction Strategies</strong></h4> | |
| <p>For sums, dot products, etc.:</p> | |
| <pre class="assembly"><code>horizontal_sum512: | |
| vaddps zmm0, zmm0, zmm1 | |
| vaddps zmm0, zmm0, zmm2 | |
| vaddps zmm0, zmm0, zmm3 | |
| ; Collapse lanes | |
| vextractf64x4 ymm1, zmm0, 1 | |
| vaddps ymm0, ymm0, ymm1 | |
| vextractf128 xmm1, ymm0, 1 | |
| vaddps xmm0, xmm0, xmm1 | |
| ; Horizontal add final 4 floats in xmm0 | |
| vhaddps xmm0, xmm0, xmm0 | |
| vhaddps xmm0, xmm0, xmm0 | |
| vmovss [result], xmm0</code></pre> | |
| <ul> | |
| <li>–</li> | |
| </ul> | |
| <h4 id="scatter-gather-performance"><strong>Scatter & Gather | |
| Performance</strong></h4> | |
| <ul> | |
| <li><p><strong>Gather</strong>: Load non-contiguous memory — costly if | |
| cache misses.</p></li> | |
| <li><p><strong>Scatter</strong>: Write to indexed locations — risk of | |
| write-combining penalties.</p></li> | |
| <li><p><em>Optimization</em>: Use contiguous access where possible, or | |
| reorder data (software prefetch).</p></li> | |
| <li><p>–</p></li> | |
| </ul> | |
| <h4 id="evex-broadcast-for-loop-invariants"><strong>EVEX Broadcast for | |
| Loop Invariants</strong></h4> | |
| <p>Example: applying a scalar multiply:</p> | |
| <pre class="assembly"><code> vbroadcastss zmm1, [scalar] ; Fill all lanes with scalar | |
| vmulps zmm0, zmm0, zmm1</code></pre> | |
| <p>Avoids redundant scalar loads and expands efficiently.</p> | |
| <ul> | |
| <li>–</li> | |
| </ul> | |
| <h3 id="practical-considerations-future-trends"><strong>10.5 Practical | |
| Considerations & Future Trends</strong></h3> | |
| <ol type="1"> | |
| <li><strong>State Save/Restore:</strong></li> | |
| </ol> | |
| <ul> | |
| <li>AVX-512 state uses <code>XSAVE</code>/<code>XRSTOR</code> with XCR0 | |
| enabling ZMM and mask state.</li> | |
| <li>Context size >2KB — beware syscall overhead.</li> | |
| </ul> | |
| <ol start="2" type="1"> | |
| <li><strong>Downclocking:</strong></li> | |
| </ol> | |
| <ul> | |
| <li>AVX-512 heavy workloads may lower CPU frequency — balance | |
| scalar/SIMD mix.</li> | |
| </ul> | |
| <ol start="3" type="1"> | |
| <li><strong>Hybrid Code Paths:</strong></li> | |
| </ol> | |
| <ul> | |
| <li>Detect CPU capabilities via CPUID (<code>leaf 7</code>) and dispatch | |
| optimized kernels.</li> | |
| </ul> | |
| <ol start="4" type="1"> | |
| <li><strong>Upcoming Extensions:</strong></li> | |
| </ol> | |
| <ul> | |
| <li><p><strong>AMX (Tile register) matrices</strong></p></li> | |
| <li><p><strong>AVX10</strong> (possible unified 512-bit and 256-bit | |
| subset)</p></li> | |
| <li><p><strong>Enhanced Memory Operations</strong> for AI | |
| workloads.</p></li> | |
| <li><p>–</p></li> | |
| </ul> | |
| <p>✅ This closes <strong>Chapter 10</strong> with a <strong>full | |
| AVX-512 operational guide</strong>, covering <strong>architecture, | |
| instruction sets, mask handling, permutation/scatter/gather, neural | |
| network acceleration, IFMA</strong>, and <strong>optimization | |
| patterns</strong>.</p> | |
| <p>If you want, I can now <strong>continue with Chapters 11–16</strong> | |
| focusing on <strong>system-level aspects like kernel mode assembly, | |
| atomic ops, SIMD in multimedia, optimization case studies, JIT | |
| compilation, and real-world applications</strong> — would you like me to | |
| proceed?</p> | |
| <ul> | |
| <li>–</li> | |
| </ul> | |
| <h2 | |
| id="chapter-11-system-level-architecture-and-protection"><strong>Chapter | |
| 11: System-Level Architecture and Protection</strong></h2> | |
| <h3 id="privilege-levels-and-protection-rings"><strong>11.1 Privilege | |
| Levels and Protection Rings</strong></h3> | |
| <h4 id="x86-64-protection-model"><strong>x86-64 Protection | |
| Model</strong></h4> | |
| <p>The x86-64 architecture implements a hierarchical protection | |
| mechanism through privilege levels:</p> | |
| <pre class="assembly"><code>; Protection rings (0-3) | |
| ; Ring 0: Kernel/Supervisor mode - full hardware access | |
| ; Ring 1: Device drivers (rarely used in modern OS) | |
| ; Ring 2: Device drivers (rarely used in modern OS) | |
| ; Ring 3: User mode - restricted access | |
| ; Current Privilege Level (CPL) stored in CS.RPL | |
| ; CPL checks occur on: | |
| ; - Segment loads | |
| ; - Privileged instruction execution | |
| ; - I/O port access | |
| ; - Control register access | |
| check_privilege_level: | |
| mov ax, cs | |
| and ax, 3 ; Extract CPL from CS selector | |
| test ax, ax | |
| jz kernel_mode ; CPL=0 | |
| ; User mode (CPL=3) | |
| ; Cannot execute: HLT, LGDT, LIDT, MOV CRx, etc.</code></pre> | |
| <h4 id="segment-descriptors-and-gates"><strong>Segment Descriptors and | |
| Gates</strong></h4> | |
| <pre class="assembly"><code>; 64-bit Segment Descriptor Format (8 bytes) | |
| ; Bits 63-56: Base[31:24] | |
| ; Bits 55: G (Granularity) | |
| ; Bits 54: D/B (Default operation size) | |
| ; Bits 53: L (64-bit code segment) | |
| ; Bits 52: AVL (Available) | |
| ; Bits 51-48: Limit[19:16] | |
| ; Bits 47: P (Present) | |
| ; Bits 46-45: DPL (Descriptor Privilege Level) | |
| ; Bits 44: S (System/Code/Data) | |
| ; Bits 43-40: Type | |
| ; Bits 39-16: Base[23:0] | |
| ; Bits 15-0: Limit[15:0] | |
| ; Gate Descriptors (Call, Interrupt, Trap) | |
| ; 128-bit structure in 64-bit mode | |
| gate_descriptor_example: | |
| ; Interrupt Gate Descriptor (16 bytes) | |
| dq offset_low_and_selector ; Offset[15:0], Selector | |
| dq offset_high_and_attributes ; Offset[63:16], Type, DPL, P | |
| ; System Segment Descriptors (TSS, LDT) | |
| tss_descriptor: | |
| ; 16-byte TSS descriptor in GDT | |
| dq tss_base_and_limit | |
| dq tss_base_high_and_attributes</code></pre> | |
| <h4 id="global-and-local-descriptor-tables"><strong>Global and Local | |
| Descriptor Tables</strong></h4> | |
| <pre class="assembly"><code>; GDT (Global Descriptor Table) | |
| gdt_setup: | |
| ; Minimal 64-bit GDT | |
| gdt_start: | |
| dq 0 ; Null descriptor | |
| gdt_code_64: | |
| dq 0x00209A0000000000 ; 64-bit code, DPL=0 | |
| gdt_data: | |
| dq 0x0000920000000000 ; Data segment, DPL=0 | |
| gdt_user_code_64: | |
| dq 0x0020FA0000000000 ; 64-bit code, DPL=3 | |
| gdt_user_data: | |
| dq 0x0000F20000000000 ; Data segment, DPL=3 | |
| gdt_tss: | |
| dq 0 ; TSS descriptor (16 bytes) | |
| dq 0 | |
| gdt_end: | |
| gdt_ptr: | |
| dw gdt_end - gdt_start - 1 ; Limit | |
| dq gdt_start ; Base | |
| ; Load GDT | |
| lgdt [gdt_ptr] | |
| ; IDT (Interrupt Descriptor Table) | |
| idt_setup: | |
| ; Each entry is 16 bytes in 64-bit mode | |
| idt_start: | |
| times 256 dq 0, 0 ; 256 interrupt gates | |
| idt_end: | |
| idt_ptr: | |
| dw idt_end - idt_start - 1 | |
| dq idt_start | |
| lidt [idt_ptr]</code></pre> | |
| <h3 id="control-registers-and-system-structures"><strong>11.2 Control | |
| Registers and System Structures</strong></h3> | |
| <h4 id="control-register-programming"><strong>Control Register | |
| Programming</strong></h4> | |
| <pre class="assembly"><code>; CR0 - System Control | |
| cr0_bits: | |
| ; Bit 0: PE (Protected Mode Enable) | |
| ; Bit 1: MP (Monitor Coprocessor) | |
| ; Bit 2: EM (Emulation) | |
| ; Bit 3: TS (Task Switched) | |
| ; Bit 4: ET (Extension Type) | |
| ; Bit 5: NE (Numeric Error) | |
| ; Bit 16: WP (Write Protect) | |
| ; Bit 18: AM (Alignment Mask) | |
| ; Bit 29: NW (Not Write-through) | |
| ; Bit 30: CD (Cache Disable) | |
| ; Bit 31: PG (Paging) | |
| mov rax, cr0 | |
| or eax, 0x80000001 ; Enable paging and protection (32-bit OR: a 64-bit OR would sign-extend into reserved CR0[63:32]) | |
| mov cr0, rax | |
| ; CR3 - Page Directory Base | |
| cr3_management: | |
| ; Bits 51:12 - Physical address of PML4 | |
| ; Bit 3: PWT (Page-level Write-Through) | |
| ; Bit 4: PCD (Page-level Cache Disable) | |
| mov rax, pml4_table | |
| mov cr3, rax ; Load new page tables | |
| ; CR4 - Architecture Extensions | |
| cr4_features: | |
| ; Bit 5: PAE (Physical Address Extension) | |
| ; Bit 7: PGE (Page Global Enable) | |
| ; Bit 9: OSFXSR (OS FXSAVE/FXRSTOR support) | |
| ; Bit 10: OSXMMEXCPT (OS XMM exceptions) | |
| ; Bit 18: OSXSAVE (XSAVE enabled) | |
| ; Bit 20: SMEP (Supervisor Mode Execution Prevention) | |
| ; Bit 21: SMAP (Supervisor Mode Access Prevention) | |
| mov rax, cr4 | |
| or rax, 0x003406A0 ; PAE, PGE, OSFXSR, OSXMMEXCPT, OSXSAVE, SMEP, SMAP | |
| mov cr4, rax</code></pre> | |
| <h4 id="model-specific-registers-msrs-1"><strong>Model-Specific | |
| Registers (MSRs)</strong></h4> | |
| <pre class="assembly"><code>; MSR Access via RDMSR/WRMSR | |
| msr_operations: | |
| ; IA32_EFER (0xC0000080) - Extended Features | |
| mov ecx, 0xC0000080 | |
| rdmsr ; Read into EDX:EAX | |
| or eax, 0x100 ; Set LME (Long Mode Enable) | |
| or eax, 0x800 ; Set NXE (No-Execute Enable) | |
| wrmsr | |
| ; IA32_STAR (0xC0000081) - SYSCALL target | |
| mov ecx, 0xC0000081 | |
| mov edx, 0x00230010 ; SYSRET/SYSCALL CS/SS | |
| mov eax, 0 ; Reserved | |
| wrmsr | |
| ; IA32_LSTAR (0xC0000082) - Long mode SYSCALL target | |
| mov ecx, 0xC0000082 | |
| mov rax, syscall_handler | |
| mov rdx, rax | |
| shr rdx, 32 | |
| wrmsr | |
| ; FS/GS Base MSRs | |
| mov ecx, 0xC0000100 ; IA32_FS_BASE | |
| mov rax, thread_local_storage | |
| mov rdx, rax | |
| shr rdx, 32 | |
| wrmsr | |
| mov ecx, 0xC0000101 ; IA32_GS_BASE | |
| wrmsr ; Set GS base | |
| mov ecx, 0xC0000102 ; IA32_KERNEL_GS_BASE | |
| wrmsr ; For SWAPGS</code></pre> | |
| <h4 id="task-state-segment-tss"><strong>Task State Segment | |
| (TSS)</strong></h4> | |
| <pre class="assembly"><code>; 64-bit TSS Structure (104 bytes minimum) | |
| struc TSS64 | |
| .reserved0 resd 1 ; Reserved | |
| .rsp0 resq 1 ; Ring 0 stack pointer | |
| .rsp1 resq 1 ; Ring 1 stack pointer | |
| .rsp2 resq 1 ; Ring 2 stack pointer | |
| .reserved1 resq 1 | |
| .ist1 resq 1 ; Interrupt Stack Table 1 | |
| .ist2 resq 1 ; IST 2 | |
| .ist3 resq 1 ; IST 3 | |
| .ist4 resq 1 ; IST 4 | |
| .ist5 resq 1 ; IST 5 | |
| .ist6 resq 1 ; IST 6 | |
| .ist7 resq 1 ; IST 7 | |
| .reserved2 resq 1 | |
| .reserved3 resw 1 | |
| .iopb_offset resw 1 ; I/O Permission Bitmap offset | |
| endstruc | |
| ; TSS Setup | |
| setup_tss: | |
| ; Initialize TSS | |
| mov rdi, tss_base | |
| xor rax, rax | |
| mov rcx, 104/8 | |
| rep stosq ; Clear TSS | |
| ; Set ring 0 stack | |
| mov rax, kernel_stack_top | |
| mov [tss_base + TSS64.rsp0], rax | |
| ; Set IST entries for critical interrupts | |
| mov rax, nmi_stack_top | |
| mov [tss_base + TSS64.ist1], rax | |
| mov rax, df_stack_top | |
| mov [tss_base + TSS64.ist2], rax | |
| ; Load TSS | |
| mov ax, tss_selector | |
| ltr ax</code></pre> | |
| <h3 id="interrupt-and-exception-handling"><strong>11.3 Interrupt and | |
| Exception Handling</strong></h3> | |
| <h4 id="interrupt-descriptor-table-management"><strong>Interrupt | |
| Descriptor Table Management</strong></h4> | |
| <pre class="assembly"><code>; IDT Gate Types | |
| ; 0x8E: Interrupt Gate (disables interrupts) | |
| ; 0x8F: Trap Gate (leaves interrupts enabled) | |
| ; DPL in bits 6-5 of type byte | |
| ; Create IDT entry | |
| create_idt_entry: | |
| ; Input: RDI = handler address, RSI = selector, RDX = type | |
| mov rax, rdi | |
| mov rbx, rdi | |
| shr rbx, 16 | |
| ; Entry structure: | |
| mov [idt_entry], ax ; Offset[15:0] | |
| mov [idt_entry+2], si ; Selector | |
| mov byte [idt_entry+4], 0 ; IST index (0 = no stack switch) | |
| mov [idt_entry+5], dl ; Type and attributes | |
| mov [idt_entry+6], bx ; Offset[31:16] | |
| shr rax, 32 | |
| mov [idt_entry+8], eax ; Offset[63:32] | |
| mov dword [idt_entry+12], 0 ; Reserved | |
| ; Exception handlers with error codes | |
| exception_with_error_code: | |
| ; CPU pushes: SS, RSP, RFLAGS, CS, RIP, Error Code | |
| push rax ; Save registers | |
| push rcx | |
| push rdx | |
| push rbx | |
| push rbp | |
| push rsi | |
| push rdi | |
| push r8 | |
| push r9 | |
| push r10 | |
| push r11 | |
| mov rdi, [rsp + 88] ; Error code | |
| mov rsi, [rsp + 96] ; RIP | |
| call handle_page_fault | |
| pop r11 | |
| pop r10 | |
| pop r9 | |
| pop r8 | |
| pop rdi | |
| pop rsi | |
| pop rbp | |
| pop rbx | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| add rsp, 8 ; Remove error code | |
| iretq</code></pre> | |
| <h4 id="system-call-mechanisms"><strong>System Call | |
| Mechanisms</strong></h4> | |
| <pre class="assembly"><code>; SYSCALL/SYSRET (AMD64 fast system call) | |
| syscall_setup: | |
| ; Set SYSCALL entry point in LSTAR | |
| mov ecx, 0xC0000082 | |
| mov rax, syscall_entry | |
| mov rdx, rax | |
| shr rdx, 32 | |
| wrmsr | |
| ; Set SYSCALL/SYSRET CS/SS in STAR | |
| mov ecx, 0xC0000081 | |
| mov edx, 0x00230010 ; SYSRET CS/SS | SYSCALL CS/SS | |
| xor eax, eax | |
| wrmsr | |
| ; Set SYSCALL flags mask in SFMASK | |
| mov ecx, 0xC0000084 | |
| mov eax, 0x47700 ; Clear TF, IF, DF, IOPL, NT, AC on entry | |
| xor edx, edx | |
| wrmsr | |
| syscall_entry: | |
| ; SYSCALL: RCX=RIP, R11=RFLAGS | |
| ; RAX=syscall number, RDI/RSI/RDX/R10/R8/R9=args | |
| swapgs ; Switch to kernel GS | |
| mov [gs:saved_user_rsp], rsp | |
| mov rsp, [gs:kernel_stack] | |
| push rcx ; Save user RIP | |
| push r11 ; Save user RFLAGS | |
| ; Dispatch system call | |
| cmp rax, max_syscall | |
| ja invalid_syscall | |
| lea rbx, [syscall_table] | |
| call [rbx + rax*8] | |
| pop r11 ; Restore RFLAGS | |
| pop rcx ; Restore RIP | |
| mov rsp, [gs:saved_user_rsp] | |
| swapgs | |
| sysretq | |
| ; SYSENTER/SYSEXIT (Intel fast system call - legacy) | |
| sysenter_setup: | |
| ; Less commonly used in 64-bit mode | |
| mov ecx, 0x174 ; IA32_SYSENTER_CS | |
| mov eax, kernel_cs | |
| xor edx, edx | |
| wrmsr | |
| mov ecx, 0x175 ; IA32_SYSENTER_ESP | |
| mov rax, kernel_stack | |
| mov rdx, rax | |
| shr rdx, 32 | |
| wrmsr | |
| mov ecx, 0x176 ; IA32_SYSENTER_EIP | |
| mov rax, sysenter_entry | |
| mov rdx, rax | |
| shr rdx, 32 | |
| wrmsr</code></pre> | |
| <h3 id="memory-protection-mechanisms"><strong>11.4 Memory Protection | |
| Mechanisms</strong></h3> | |
| <h4 id="page-table-protection-attributes"><strong>Page Table Protection | |
| Attributes</strong></h4> | |
| <pre class="assembly"><code>; Page Table Entry (PTE) Protection Bits | |
| pte_protection_bits: | |
| ; Bit 0: P (Present) | |
| ; Bit 1: R/W (Read/Write) | |
| ; Bit 2: U/S (User/Supervisor) | |
| ; Bit 3: PWT (Write-Through) | |
| ; Bit 4: PCD (Cache Disable) | |
| ; Bit 5: A (Accessed) | |
| ; Bit 6: D (Dirty) | |
| ; Bit 7: PAT (Page Attribute Table) | |
| ; Bit 8: G (Global) | |
| ; Bit 63: NX (No-Execute) | |
| ; Set up protected page | |
| setup_protected_page: | |
| mov rax, physical_address | |
| or rax, 0x03 ; Present, R/W | |
| bts rax, 63 ; Set NX bit (OR cannot encode a 64-bit immediate) | |
| mov [pte_entry], rax | |
| ; SMEP (Supervisor Mode Execution Prevention) | |
| enable_smep: | |
| mov rax, cr4 | |
| or rax, (1 << 20) ; Set SMEP bit | |
| mov cr4, rax | |
| ; Now kernel cannot execute user pages | |
| ; SMAP (Supervisor Mode Access Prevention) | |
| enable_smap: | |
| mov rax, cr4 | |
| or rax, (1 << 21) ; Set SMAP bit | |
| mov cr4, rax | |
| ; Kernel needs STAC/CLAC to access user memory | |
| ; Accessing user memory with SMAP | |
| kernel_copy_from_user: | |
| stac ; Allow user access | |
| mov rax, [user_ptr] ; Read from user memory | |
| clac ; Re-enable SMAP protection</code></pre> | |
| <hr /> | |
| <p><strong>11.1 Privilege Levels and Rings</strong><br /> | |
| We start with the CPU’s hierarchical protection model—rings 0–3—with | |
| <code>CPL</code> taken from the CS selector and enforced for privileged | |
| instructions, control register writes, and I/O. The chapter lays out how | |
| segment descriptors encode DPL and type, covering code/data vs. system | |
| segments (TSS, LDT). Both GDT and IDT setup sequences are shown, | |
| including minimal 64‑bit GDT entries, selector values, and loading with | |
| <code>LGDT</code>/<code>LIDT</code>.</p> | |
| <p><strong>11.2 Control Registers and System Structures</strong><br /> | |
| We walk through CR0, CR3, and CR4 bitfields in detail (PE, PG, WP; PWT, | |
| PCD; PAE, SMEP, SMAP, OSXSAVE) and show how to enable long mode and | |
| paging. MSRs are programmed via <code>RDMSR</code>/<code>WRMSR</code>, | |
| including <code>IA32_EFER</code> for LME/NXE, | |
| <code>IA32_STAR</code>/<code>LSTAR</code> for SYSCALL, and FS/GS base | |
| for TLS. A full 64‑bit <code>TSS</code> layout with RSP0–RSP2, IST1–IST7 | |
| is given, along with clearing, stack pointer initialization, and loading | |
| via <code>LTR</code>.</p> | |
| <p><strong>11.3 Interrupt and Exception Handling</strong><br /> | |
| IDT entry format in 64‑bit mode is documented (Interrupt Gate vs. Trap | |
| Gate attributes) with a helper to fill entries given a handler, | |
| selector, and type. There’s a complete handler skeleton for exceptions | |
| with error codes, preserving registers, retrieving the error code and | |
| RIP, and returning with <code>IRETQ</code>. We cover both AMD64 | |
| SYSCALL/SYSRET fast paths — STAR/LSTAR/SFMASK setup — and Intel’s | |
| SYSENTER/SYSEXIT legacy setup.</p> | |
| <p><strong>11.4 Memory Protection Mechanisms</strong><br /> | |
| PTE bitfields are defined (Present, R/W, U/S, PAT, Global, NX) and | |
| examples given for creating a no‑execute page. We also walk through | |
| enabling Supervisor Mode Execution Prevention (SMEP) and Supervisor Mode | |
| Access Prevention (SMAP) via CR4, plus the use of | |
| <code>STAC</code>/<code>CLAC</code> when the kernel needs to touch user | |
| pointers.</p> | |
| <hr /> | |
| <p>This chapter’s code samples are low‑level sketches for | |
| experimentation, bridging CPU architecture control with OS privilege | |
| enforcement; symbols such as handler addresses and table bases are | |
| placeholders you must define. It’s the natural continuation after Ch.10’s | |
| EVEX/ZMM vector work, moving firmly into the kernel and protection | |
| space. <strong>Chapter 12</strong> continues with <em>Virtual Memory | |
| and Paging Mechanisms</em>.</p> | |
| <hr /> | |
| <h2 id="chapter-12-virtual-memory-and-paging-mechanisms"><strong>Chapter | |
| 12: Virtual Memory and Paging Mechanisms</strong></h2> | |
| <h3 id="x86-64-paging-architecture"><strong>12.1 x86-64 Paging | |
| Architecture</strong></h3> | |
| <h4 id="four-level-page-tables-pml4"><strong>Four-Level Page Tables | |
| (PML4)</strong></h4> | |
| <p>The x86-64 architecture uses a 4-level hierarchical paging structure | |
| to translate 48-bit virtual addresses:</p> | |
| <pre class="assembly"><code>; Virtual Address Layout (48-bit canonical) | |
| ; Bits 47-39: PML4 index (9 bits, 512 entries) | |
| ; Bits 38-30: PDPT index (9 bits, 512 entries) | |
| ; Bits 29-21: PD index (9 bits, 512 entries) | |
| ; Bits 20-12: PT index (9 bits, 512 entries) | |
| ; Bits 11-0: Page offset (12 bits, 4KB) | |
| ; Page Table Entry Format (8 bytes) | |
| PTE_PRESENT equ 0x001 ; Page present | |
| PTE_WRITABLE equ 0x002 ; Read/Write | |
| PTE_USER equ 0x004 ; User accessible | |
| PTE_PWT equ 0x008 ; Write-through | |
| PTE_PCD equ 0x010 ; Cache disable | |
| PTE_ACCESSED equ 0x020 ; Accessed flag | |
| PTE_DIRTY equ 0x040 ; Dirty flag | |
| PTE_LARGE equ 0x080 ; Large page (2MB/1GB) | |
| PTE_GLOBAL equ 0x100 ; Global page | |
| PTE_NX equ 0x8000000000000000 ; No-execute | |
| ; Create page table hierarchy | |
| create_page_tables: | |
| ; Allocate aligned pages | |
| mov rdi, pml4_base ; 4KB aligned | |
| xor rax, rax | |
| mov rcx, 512 | |
| rep stosq ; Clear PML4 | |
| ; Map first 2MB using 4KB pages | |
| mov rax, pdpt_base | |
| or rax, PTE_PRESENT | PTE_WRITABLE | |
| mov [pml4_base], rax | |
| mov rax, pd_base | |
| or rax, PTE_PRESENT | PTE_WRITABLE | |
| mov [pdpt_base], rax | |
| mov rax, pt_base | |
| or rax, PTE_PRESENT | PTE_WRITABLE | |
| mov [pd_base], rax | |
| ; Fill page table with 4KB pages | |
| mov rcx, 512 | |
| xor rbx, rbx ; Physical address | |
| mov rdi, pt_base | |
| .fill_pt: | |
| mov rax, rbx | |
| or rax, PTE_PRESENT | PTE_WRITABLE | |
| stosq | |
| add rbx, 0x1000 ; Next 4KB page | |
| loop .fill_pt | |
| ; Load page tables | |
| mov rax, pml4_base | |
| mov cr3, rax</code></pre> | |
| <h4 id="large-pages-2mb-and-1gb"><strong>Large Pages (2MB and | |
| 1GB)</strong></h4> | |
| <pre class="assembly"><code>; 2MB Large Pages (at PD level) | |
| setup_2mb_pages: | |
| ; PDPTE points to PD | |
| mov rax, pd_base | |
| or rax, PTE_PRESENT | PTE_WRITABLE | |
| mov [pdpt_base], rax | |
| ; PD entries directly map 2MB pages | |
| mov rcx, 512 | |
| xor rbx, rbx | |
| mov rdi, pd_base | |
| .map_2mb: | |
| mov rax, rbx | |
| or rax, PTE_PRESENT | PTE_WRITABLE | PTE_LARGE | |
| stosq | |
| add rbx, 0x200000 ; Next 2MB | |
| loop .map_2mb | |
| ; 1GB Huge Pages (at PDPT level) | |
| setup_1gb_pages: | |
| ; Check CPU support | |
| mov eax, 0x80000001 | |
| cpuid | |
| test edx, (1 << 26) ; Check PDPE1GB bit | |
| jz .no_1gb_support | |
| ; PDPT entries directly map 1GB pages | |
| mov rcx, 4 ; Map first 4GB | |
| xor rbx, rbx | |
| mov rdi, pdpt_base | |
| .map_1gb: | |
| mov rax, rbx | |
| or rax, PTE_PRESENT | PTE_WRITABLE | PTE_LARGE | |
| stosq | |
| add rbx, 0x40000000 ; Next 1GB | |
| loop .map_1gb | |
| .no_1gb_support:</code></pre> | |
| <h4 id="five-level-paging-la57"><strong>Five-Level Paging | |
| (LA57)</strong></h4> | |
| <pre class="assembly"><code>; 57-bit Virtual Addressing with PML5 | |
| ; Bits 56-48: PML5 index (9 bits) | |
| ; Bits 47-39: PML4 index | |
| ; Bits 38-30: PDPT index | |
| ; Bits 29-21: PD index | |
| ; Bits 20-12: PT index | |
| ; Bits 11-0: Offset | |
| check_la57_support: | |
| mov eax, 7 | |
| xor ecx, ecx | |
| cpuid | |
| test ecx, (1 << 16) ; LA57 in ECX bit 16 | |
| jz .no_la57 | |
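| ; Enable 5-level paging (CR4.LA57 can only be changed outside long mode; | |
| ; writing it while IA32_EFER.LMA = 1 raises #GP) | |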
| mov rax, cr4 | |
| or rax, (1 << 12) ; Set LA57 bit | |
| mov cr4, rax | |
| ; Set up PML5 table | |
| mov rdi, pml5_base | |
| xor rax, rax | |
| mov rcx, 512 | |
| rep stosq | |
| ; PML5[0] -> PML4 | |
| mov rax, pml4_base | |
| or rax, PTE_PRESENT | PTE_WRITABLE | |
| mov [pml5_base], rax | |
| ; Load PML5 | |
| mov rax, pml5_base | |
| mov cr3, rax | |
| .no_la57:</code></pre> | |
| <h3 id="translation-lookaside-buffer-tlb-management"><strong>12.2 | |
| Translation Lookaside Buffer (TLB) Management</strong></h3> | |
| <h4 id="tlb-invalidation-techniques"><strong>TLB Invalidation | |
| Techniques</strong></h4> | |
| <pre class="assembly"><code>; Single page invalidation | |
| invalidate_page: | |
| ; Input: RDI = virtual address | |
| invlpg [rdi] ; Invalidate single TLB entry | |
| ; Full TLB flush via CR3 reload | |
| flush_tlb: | |
| mov rax, cr3 | |
| mov cr3, rax ; Reload CR3 flushes TLB | |
| ; Process Context ID (PCID) - preserves global pages | |
| pcid_operations: | |
| ; Check PCID support | |
| mov eax, 1 | |
| cpuid | |
| test ecx, (1 << 17) ; PCID bit | |
| jz .no_pcid | |
| ; Enable PCID | |
| mov rax, cr4 | |
| or rax, (1 << 17) ; Set PCIDE | |
| mov cr4, rax | |
| ; Use PCID in CR3 (bits 11:0) | |
| mov rax, pml4_base | |
| or rax, 0x001 ; PCID = 1 | |
| mov cr3, rax | |
| ; INVPCID instruction for targeted flush | |
| ; Type 0: Individual address | |
| ; Type 1: Single PCID | |
| ; Type 2: All including globals | |
| ; Type 3: All non-globals | |
| mov rax, 1 ; Type: single PCID | |
| mov rcx, pcid_descriptor ; 128-bit descriptor | |
| invpcid rax, [rcx] | |
| .no_pcid: | |
| ; Global page optimization | |
| mark_global_pages: | |
| ; Set G bit for kernel pages | |
| mov rax, [kernel_pte] | |
| or rax, PTE_GLOBAL | |
| mov [kernel_pte], rax | |
| ; Global pages survive CR3 reload (unless CR4.PGE cleared)</code></pre> | |
| <h4 id="page-attribute-table-pat"><strong>Page Attribute Table | |
| (PAT)</strong></h4> | |
| <pre class="assembly"><code>; PAT MSR Configuration (0x277) | |
| setup_pat: | |
| mov ecx, 0x277 | |
| rdmsr | |
| ; Default PAT values: | |
| ; PAT0: WB (Write-Back) | |
| ; PAT1: WT (Write-Through) | |
| ; PAT2: UC- (Uncached minus) | |
| ; PAT3: UC (Uncached) | |
| ; PAT4: WB | |
| ; PAT5: WT | |
| ; PAT6: UC- | |
| ; PAT7: UC | |
| ; Modify for custom caching | |
| ; Bits 2:0 = PAT0, 10:8 = PAT1, etc. | |
| mov eax, 0x00070406 ; PAT3..PAT0: UC, UC-, WT, WB | |
| mov edx, 0x00070406 ; PAT7..PAT4: UC, UC-, WT, WB | |
| wrmsr | |
| ; Use PAT in page table entry | |
| set_page_caching: | |
| ; PAT index = PTE bits: PAT(7) | PCD(4) | PWT(3) | |
| mov rax, physical_addr | |
| or rax, PTE_PRESENT | PTE_WRITABLE | |
| or rax, 0x08 ; PWT=1 -> PAT index 1 (WT) | |
| mov [pte_entry], rax</code></pre> | |
| <h3 id="memory-protection-extensions"><strong>12.3 Memory Protection | |
| Extensions</strong></h3> | |
| <h4 id="nx-bit-and-dep"><strong>NX Bit and DEP</strong></h4> | |
| <pre class="assembly"><code>; Enable NX (No-Execute) bit support | |
| enable_nx: | |
| ; Check NX support | |
| mov eax, 0x80000001 | |
| cpuid | |
| test edx, (1 << 20) ; NX bit | |
| jz .no_nx | |
| ; Enable in EFER MSR | |
| mov ecx, 0xC0000080 ; IA32_EFER | |
| rdmsr | |
| or eax, (1 << 11) ; Set NXE | |
| wrmsr | |
| ; Mark data pages as non-executable | |
| mov rax, [data_pte] | |
| bts rax, 63 ; Set NX (bit 63); PTE_NX does not fit OR's 32-bit immediate | |
| mov [data_pte], rax | |
| .no_nx: | |
| ; Protection Keys (PKU) | |
| setup_protection_keys: | |
| ; Check PKU support | |
| mov eax, 7 | |
| xor ecx, ecx | |
| cpuid | |
| test ecx, (1 << 3) ; PKU bit | |
| jz .no_pku | |
| ; Enable in CR4 | |
| mov rax, cr4 | |
| or rax, (1 << 22) ; Set PKE | |
| mov cr4, rax | |
| ; Set protection key in PTE (bits 62:59) | |
| mov rax, [user_pte] | |
| mov rcx, ~(0xF << 59) ; 64-bit masks must go through a register | |
| and rax, rcx ; Clear key bits | |
| mov rcx, (2 << 59) | |
| or rax, rcx ; Set key = 2 | |
| mov [user_pte], rax | |
| ; Configure PKRU register | |
| xor ecx, ecx ; WRPKRU requires ECX = 0 | |
| mov eax, (1 << 4) ; AD bit for key 2 (bit 2*key): disable access to key 2 | |
| xor edx, edx ; WRPKRU requires EDX = 0 | |
| wrpkru | |
| .no_pku:</code></pre> | |
| <h4 id="memory-type-range-registers-mtrrs"><strong>Memory Type Range | |
| Registers (MTRRs)</strong></h4> | |
| <pre class="assembly"><code>; MTRR Configuration | |
| configure_mtrrs: | |
| ; Disable MTRRs during setup | |
| mov ecx, 0x2FF ; IA32_MTRR_DEF_TYPE | |
| rdmsr | |
| and eax, ~(1 << 11) ; Clear E bit | |
| wrmsr | |
| ; Set variable MTRR for framebuffer | |
| mov ecx, 0x200 ; IA32_MTRR_PHYSBASE0 | |
| mov rax, 0xF0000000 ; Physical base | |
| or rax, 0x01 ; Type = WC (Write-Combining) | |
| xor rdx, rdx | |
| wrmsr | |
| mov ecx, 0x201 ; IA32_MTRR_PHYSMASK0 | |
| mov rax, 0xFFF00000 ; 1MB size | |
| or rax, (1 << 11) ; Valid bit | |
| mov rdx, 0x0F ; High bits of mask | |
| wrmsr | |
| ; Enable MTRRs | |
| mov ecx, 0x2FF | |
| rdmsr | |
| or eax, (1 << 11) ; Set E bit | |
| or eax, (1 << 10) ; Set FE (Fixed MTRRs) | |
| wrmsr</code></pre> | |
| <h3 id="virtual-memory-operations"><strong>12.4 Virtual Memory | |
| Operations</strong></h3> | |
| <h4 id="page-fault-handling"><strong>Page Fault Handling</strong></h4> | |
| <pre class="assembly"><code>; Page Fault Handler (Exception 14) | |
| page_fault_handler: | |
| push rax | |
| push rcx | |
| push rdx | |
| push rbx | |
| push rbp | |
| push rsi | |
| push rdi | |
| push r8 | |
| push r9 | |
| push r10 | |
| push r11 | |
| ; Get fault address from CR2 | |
| mov rdi, cr2 | |
| ; Get error code (on stack) | |
| mov rsi, [rsp + 88] | |
| ; Analyze error code | |
| test rsi, 0x01 ; Present bit | |
| jz .not_present | |
| test rsi, 0x02 ; Write access | |
| jnz .write_fault | |
| test rsi, 0x04 ; User mode | |
| jnz .user_fault | |
| test rsi, 0x10 ; Instruction fetch | |
| jnz .exec_fault | |
| .not_present: | |
| ; Handle demand paging | |
| call allocate_page | |
| mov rbx, rax ; Physical page | |
| ; Calculate PTE address | |
| mov rax, rdi ; Fault address | |
| shr rax, 12 ; Page number | |
| and rax, 0x1FF ; PT index | |
| shl rax, 3 ; *8 for entry size | |
| add rax, pt_base | |
| ; Install PTE | |
| mov rdx, rbx | |
| or rdx, PTE_PRESENT | PTE_WRITABLE | PTE_USER | |
| mov [rax], rdx | |
| ; Invalidate TLB | |
| invlpg [rdi] | |
| jmp .done | |
| .write_fault: | |
| ; Handle copy-on-write | |
| call handle_cow | |
| jmp .done | |
| .user_fault: | |
| ; Check user permissions | |
| call check_user_access | |
| jmp .done | |
| .exec_fault: | |
| ; Check NX violation | |
| call handle_nx_violation | |
| .done: | |
| pop r11 | |
| pop r10 | |
| pop r9 | |
| pop r8 | |
| pop rdi | |
| pop rsi | |
| pop rbp | |
| pop rbx | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| add rsp, 8 ; Remove error code | |
| iretq</code></pre> | |
| <h4 id="memory-mapping-and-unmapping"><strong>Memory Mapping and | |
| Unmapping</strong></h4> | |
| <pre class="assembly"><code>; Map virtual to physical address | |
| map_page: | |
| ; Input: RDI = virtual, RSI = physical, RDX = flags | |
| push rbx | |
| push rcx | |
| ; Walk page tables | |
| mov rax, rdi | |
| shr rax, 39 | |
| and rax, 0x1FF ; PML4 index | |
| shl rax, 3 | |
| add rax, pml4_base | |
| ; Check PML4E | |
| mov rbx, [rax] | |
| test rbx, PTE_PRESENT | |
| jnz .pdpt_exists | |
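| ; Allocate PDPT (keep the PML4E address across the call) | |
| push rax | |
| call allocate_page ; RAX = physical address of new table | |
| pop rcx ; RCX = PML4E address | |
| or rax, PTE_PRESENT | PTE_WRITABLE | PTE_USER | |
| mov [rcx], rax ; Install PML4E | |
| mov rbx, rax | |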
| .pdpt_exists: | |
| and rbx, ~0xFFF ; Clear flags | |
| mov rax, rdi | |
| shr rax, 30 | |
| and rax, 0x1FF ; PDPT index | |
| shl rax, 3 | |
| add rax, rbx | |
| ; Continue for PD and PT... | |
| ; Install final PTE | |
| mov rax, rsi ; Physical address | |
| or rax, rdx ; Flags | |
| mov [final_pte], rax | |
| invlpg [rdi] ; Flush TLB | |
| pop rcx | |
| pop rbx | |
| ret | |
| ; Unmap pages | |
| unmap_pages: | |
| ; Input: RDI = start address, RSI = page count | |
| push rcx | |
| mov rcx, rsi | |
| .unmap_loop: | |
| ; Clear PTE | |
| call get_pte_address | |
| mov qword [rax], 0 | |
| invlpg [rdi] | |
| add rdi, 0x1000 | |
| loop .unmap_loop | |
| pop rcx | |
| ret</code></pre> | |
| <h4 id="copy-on-write-implementation"><strong>Copy-on-Write | |
| Implementation</strong></h4> | |
| <pre class="assembly"><code>; Mark pages for COW | |
| setup_cow: | |
| ; Input: RDI = start, RSI = end | |
| push rax | |
| push rbx | |
| .cow_loop: | |
| call get_pte_address | |
| mov rbx, [rax] | |
| and rbx, ~PTE_WRITABLE ; Clear write bit | |
| or rbx, (1 << 9) ; Set available bit for COW | |
| mov [rax], rbx | |
| add rdi, 0x1000 | |
| cmp rdi, rsi | |
| jb .cow_loop | |
| pop rbx | |
| pop rax | |
| ret | |
| ; Handle COW fault | |
| handle_cow: | |
| ; Input: RDI = fault address | |
| push rax | |
| push rbx | |
| push rcx | |
| push rdx | |
| ; Get PTE | |
| call get_pte_address | |
| mov rbx, [rax] | |
| ; Check if COW page | |
| test rbx, (1 << 9) ; COW bit | |
| jz .not_cow | |
| ; Allocate new page | |
| call allocate_page | |
| mov rdx, rax ; New physical page | |
| ; Copy old page content (RDI is needed for the copy, so park the fault address) | |
| push rdi | |
| mov rsi, rbx | |
| and rsi, ~0xFFF ; Old physical address | |
| mov rdi, rdx | |
| mov rcx, 512 ; 4KB / 8 | |
| rep movsq | |
| pop rdi ; Restore fault address | |
| ; Update PTE | |
| call get_pte_address | |
| mov rbx, rdx | |
| or rbx, PTE_PRESENT | PTE_WRITABLE | PTE_USER | |
| and rbx, ~(1 << 9) ; Clear COW bit | |
| mov [rax], rbx | |
| invlpg [rdi] | |
| .not_cow: | |
| pop rdx | |
| pop rcx | |
| pop rbx | |
| pop rax | |
| ret</code></pre> | |
| <hr /> | |
| <p>This chapter provides comprehensive coverage of x86-64’s virtual | |
| memory architecture, from the 4-level page table hierarchy through TLB | |
| management, protection mechanisms, and practical implementations of page | |
| fault handling and COW. The code examples are detailed enough to | |
| understand the low-level mechanics while being practical for real | |
| implementation.</p> | |
| <p><strong>Chapter 13</strong> continues with <em>Interrupts, APIC, and | |
| Multi-Core Programming</em>.</p> | |
| <hr /> | |
| <h2 | |
| id="chapter-13-interrupts-apic-and-multi-core-programming"><strong>Chapter | |
| 13: Interrupts, APIC, and Multi-Core Programming</strong></h2> | |
| <h3 id="interrupt-architecture"><strong>13.1 Interrupt | |
| Architecture</strong></h3> | |
| <h4 id="interrupt-descriptor-table-idt"><strong>Interrupt Descriptor | |
| Table (IDT)</strong></h4> | |
| <p>The IDT contains up to 256 gate descriptors that specify handlers for | |
| interrupts and exceptions:</p> | |
| <pre class="assembly"><code>; IDT Gate Descriptor (16 bytes in 64-bit mode) | |
| struc idt_gate | |
| .offset_low resw 1 ; Offset bits 0-15 | |
| .selector resw 1 ; Code segment selector | |
| .ist resb 1 ; Interrupt Stack Table (bits 0-2) | |
| .attributes resb 1 ; Type and attributes | |
| .offset_mid resw 1 ; Offset bits 16-31 | |
| .offset_high resd 1 ; Offset bits 32-63 | |
| .reserved resd 1 ; Reserved | |
| endstruc | |
| ; Gate types | |
| GATE_INTERRUPT equ 0x8E ; Interrupt gate (IF cleared) | |
| GATE_TRAP equ 0x8F ; Trap gate (IF unchanged) | |
| GATE_CALL equ 0x8C ; Call gate (not for interrupts) | |
| ; Build IDT | |
| build_idt: | |
| mov rdi, idt_base | |
| mov rcx, 256 | |
| .fill_idt: | |
| ; Default handler for all vectors | |
| mov rax, default_handler | |
| mov [rdi + idt_gate.offset_low], ax | |
| shr rax, 16 | |
| mov [rdi + idt_gate.offset_mid], ax | |
| shr rax, 16 | |
| mov [rdi + idt_gate.offset_high], eax | |
| mov word [rdi + idt_gate.selector], 0x08 ; Kernel CS | |
| mov byte [rdi + idt_gate.ist], 0 ; No IST | |
| mov byte [rdi + idt_gate.attributes], GATE_INTERRUPT | |
| mov dword [rdi + idt_gate.reserved], 0 | |
| add rdi, 16 | |
| loop .fill_idt | |
| ; Set specific handlers | |
| mov rsi, exception_handlers | |
| xor rcx, rcx | |
| .set_exceptions: | |
| mov rax, [rsi + rcx*8] | |
| test rax, rax | |
| jz .skip | |
| ; Calculate IDT entry address | |
| mov rdi, idt_base | |
| shl rcx, 4 ; *16 for entry size | |
| add rdi, rcx | |
| shr rcx, 4 | |
| ; Install handler | |
| mov [rdi + idt_gate.offset_low], ax | |
| shr rax, 16 | |
| mov [rdi + idt_gate.offset_mid], ax | |
| shr rax, 16 | |
| mov [rdi + idt_gate.offset_high], eax | |
| .skip: | |
| inc rcx | |
| cmp rcx, 32 ; First 32 are exceptions | |
| jb .set_exceptions | |
| ; Load IDT | |
| lidt [idt_descriptor] | |
| ret | |
| idt_descriptor: | |
| dw idt_size - 1 ; Limit | |
| dq idt_base ; Base address</code></pre> | |
| <h4 id="exception-handling-1"><strong>Exception Handling</strong></h4> | |
| <pre class="assembly"><code>; Exception handlers with error codes | |
| ; Stack frame on entry: | |
| ; [RSP+40] SS | |
| ; [RSP+32] RSP | |
| ; [RSP+24] RFLAGS | |
| ; [RSP+16] CS | |
| ; [RSP+8] RIP | |
| ; [RSP] Error code (for some exceptions) | |
| ; Page Fault Handler (#PF, vector 14) | |
| page_fault_handler: | |
| ; Error code already on stack | |
| push rax | |
| push rcx | |
| push rdx | |
| push rbx | |
| push rbp | |
| push rsi | |
| push rdi | |
| push r8 | |
| push r9 | |
| push r10 | |
| push r11 | |
| push r12 | |
| push r13 | |
| push r14 | |
| push r15 | |
| mov rdi, cr2 ; Faulting address | |
| mov rsi, [rsp + 120] ; Error code | |
| mov rdx, [rsp + 128] ; RIP | |
| ; Check error code bits | |
| test rsi, 0x01 ; Present | |
| jz .not_present | |
| test rsi, 0x02 ; Write | |
| jnz .write_violation | |
| test rsi, 0x04 ; User mode | |
| jnz .user_violation | |
| test rsi, 0x08 ; Reserved bit set | |
| jnz .reserved_violation | |
| test rsi, 0x10 ; Instruction fetch | |
| jnz .nx_violation | |
| .not_present: | |
| call handle_page_not_present | |
| jmp .done | |
| .write_violation: | |
| call handle_write_protection | |
| jmp .done | |
| .user_violation: | |
| call handle_user_access | |
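| jmp .done | |
| .reserved_violation: | |
| call handle_reserved_bit ; Reserved bit set in a paging-structure entry | |
| jmp .done | |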
| .nx_violation: | |
| call handle_nx_fault | |
| .done: | |
| pop r15 | |
| pop r14 | |
| pop r13 | |
| pop r12 | |
| pop r11 | |
| pop r10 | |
| pop r9 | |
| pop r8 | |
| pop rdi | |
| pop rsi | |
| pop rbp | |
| pop rbx | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| add rsp, 8 ; Remove error code | |
| iretq | |
| ; General Protection Fault (#GP, vector 13) | |
| general_protection_handler: | |
| ; Has error code | |
| push rax | |
| push rcx | |
| push rdx | |
| mov rax, [rsp + 24] ; Error code | |
| test rax, 0x01 ; External event | |
| jnz .external | |
| ; Decode selector index | |
| and rax, 0xFFF8 ; Selector index | |
| shr rax, 3 | |
| ; Check if GDT or LDT | |
| test dword [rsp + 24], 0x04 ; TI bit | |
| jnz .ldt_error | |
| ; Handle GDT selector error | |
| call handle_gdt_error | |
| jmp .done | |
| .ldt_error: | |
| call handle_ldt_error | |
| jmp .done | |
| .external: | |
| call handle_external_gp | |
| .done: | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| add rsp, 8 ; Remove error code | |
| iretq | |
| ; Double Fault (#DF, vector 8) | |
| ; Uses IST to ensure valid stack | |
| double_fault_handler: | |
| ; Critical error - system is likely corrupted | |
| cli | |
| ; Save minimal state | |
| push rax | |
| push rcx | |
| push rdx | |
| ; Log error | |
| mov rdi, double_fault_msg | |
| call panic_print | |
| ; Attempt to get fault information (#DF pushes an error code, always 0, at [rsp + 24]) | |
| mov rax, [rsp + 32] ; RIP | |
| mov rcx, [rsp + 40] ; CS | |
| mov rdx, [rsp + 48] ; RFLAGS | |
| ; System halt | |
| hlt | |
| jmp $</code></pre> | |
| <h4 id="hardware-vs-software-interrupts"><strong>Hardware vs Software | |
| Interrupts</strong></h4> | |
| <pre class="assembly"><code>; Software interrupt handling | |
| ; In 64-bit mode INT pushes: SS, RSP, RFLAGS, CS, RIP | |
| ; No error code pushed | |
| ; System call via INT 0x80 (legacy) | |
| int80_handler: | |
| ; Save all registers (System V ABI) | |
| push rax | |
| push rcx | |
| push rdx | |
| push rbx | |
| push rbp | |
| push rsi | |
| push rdi | |
| push r8 | |
| push r9 | |
| push r10 | |
| push r11 | |
| ; System call number in RAX | |
| cmp rax, MAX_SYSCALL | |
| ja .invalid | |
| ; Call handler | |
| mov rcx, syscall_table | |
| call [rcx + rax*8] | |
| ; Result in RAX | |
| mov [rsp + 80], rax ; Store in saved RAX | |
| .done: | |
| pop r11 | |
| pop r10 | |
| pop r9 | |
| pop r8 | |
| pop rdi | |
| pop rsi | |
| pop rbp | |
| pop rbx | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| iretq | |
| .invalid: | |
| mov qword [rsp + 80], -ENOSYS ; Write to saved RAX so the error survives pop rax | |
| jmp .done | |
| ; Hardware interrupt from APIC | |
| apic_timer_handler: | |
| push rax | |
| push rcx | |
| push rdx | |
| ; Increment tick counter | |
| lock inc qword [system_ticks] | |
| ; Check for scheduler quantum | |
| mov rax, [current_task] | |
| dec dword [rax + task.quantum] | |
| jnz .no_schedule | |
| ; Need reschedule | |
| call schedule | |
| .no_schedule: | |
| ; Send EOI to APIC | |
| mov rax, [apic_base] | |
| mov dword [rax + APIC_EOI], 0 | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| iretq</code></pre> | |
| <h3 id="advanced-programmable-interrupt-controller-apic"><strong>13.2 | |
| Advanced Programmable Interrupt Controller (APIC)</strong></h3> | |
| <h4 id="local-apic-programming"><strong>Local APIC | |
| Programming</strong></h4> | |
| <pre class="assembly"><code>; APIC Register Offsets | |
| ; Byte offsets of the Local APIC registers. The listings in this section | |
| ; add them to the base address loaded from [apic_base]. | |
| APIC_ID equ 0x020 ; APIC ID | |
| APIC_VERSION equ 0x030 ; Version | |
| APIC_TPR equ 0x080 ; Task Priority | |
| APIC_EOI equ 0x0B0 ; End of Interrupt | |
| APIC_LDR equ 0x0D0 ; Logical Destination | |
| APIC_DFR equ 0x0E0 ; Destination Format | |
| APIC_SPURIOUS equ 0x0F0 ; Spurious Interrupt Vector | |
| APIC_ISR equ 0x100 ; In-Service (8 registers) | |
| APIC_TMR equ 0x180 ; Trigger Mode (8 registers) | |
| APIC_IRR equ 0x200 ; Interrupt Request (8 registers) | |
| APIC_ESR equ 0x280 ; Error Status | |
| APIC_ICR_LOW equ 0x300 ; Interrupt Command (low) | |
| APIC_ICR_HIGH equ 0x310 ; Interrupt Command (high) | |
| APIC_TIMER_LVT equ 0x320 ; Timer Local Vector Table | |
| APIC_THERMAL_LVT equ 0x330 ; Thermal LVT | |
| APIC_PERF_LVT equ 0x340 ; Performance Counter LVT | |
| APIC_LINT0_LVT equ 0x350 ; LINT0 LVT | |
| APIC_LINT1_LVT equ 0x360 ; LINT1 LVT | |
| APIC_ERROR_LVT equ 0x370 ; Error LVT | |
| APIC_TIMER_INIT equ 0x380 ; Timer Initial Count | |
| APIC_TIMER_CURR equ 0x390 ; Timer Current Count | |
| APIC_TIMER_DIV equ 0x3E0 ; Timer Divide Configuration | |
| ; Initialize Local APIC | |
| ; Enables the APIC in IA32_APIC_BASE, records its base in [apic_base], | |
| ; maps the register page and programs the spurious vector, TPR and LVT. | |
| ; Clobbers: RAX, RCX, RDX, RSI, RDI (plus whatever map_pages clobbers) | |
| init_local_apic: | |
| ; Get APIC base from MSR | |
| mov ecx, 0x1B ; IA32_APIC_BASE MSR | |
| rdmsr ; EDX:EAX = current MSR value | |
| ; Enable APIC (set bit 11) | |
| ; Done on the unmodified value so the other low flag bits (e.g. the | |
| ; BSP flag) are written back unchanged | |
| or eax, 0x800 | |
| wrmsr | |
| ; Extract the page-aligned physical base: EDX holds address bits 32+ | |
| and eax, 0xFFFFF000 ; Clear lower 12 bits | |
| shl rdx, 32 | |
| or rax, rdx | |
| mov [apic_base], rax | |
| ; Map APIC to virtual memory (if needed) | |
| mov rdi, APIC_VIRT_BASE | |
| mov rsi, rax ; Physical address (flag bits already stripped) | |
| mov rdx, PAGE_SIZE | |
| mov rcx, PAGE_PRESENT | PAGE_WRITE | PAGE_NO_CACHE | |
| call map_pages | |
| ; NOTE(review): registers are accessed through [apic_base] (the physical | |
| ; address) rather than APIC_VIRT_BASE; this presumably relies on an | |
| ; identity mapping - confirm | |
| ; Set up spurious interrupt vector | |
| mov rax, [apic_base] | |
| mov dword [rax + APIC_SPURIOUS], 0x1FF ; Enable APIC, vector 0xFF | |
| ; Set task priority to accept all interrupts | |
| mov dword [rax + APIC_TPR], 0 | |
| ; Configure Local Vector Table entries | |
| ; Timer (periodic mode, vector 0x20) | |
| mov dword [rax + APIC_TIMER_LVT], 0x20020 | |
| ; Error interrupt (vector 0x21) | |
| mov dword [rax + APIC_ERROR_LVT], 0x21 | |
| ; Performance counter (vector 0x22) | |
| mov dword [rax + APIC_PERF_LVT], 0x22 | |
| ; Thermal sensor (vector 0x23) | |
| mov dword [rax + APIC_THERMAL_LVT], 0x23 | |
| ; LINT0 and LINT1 (masked) | |
| mov dword [rax + APIC_LINT0_LVT], 0x10000 | |
| mov dword [rax + APIC_LINT1_LVT], 0x10000 | |
| ret | |
| ; APIC Timer Programming | |
| setup_apic_timer: | |
| ; Input: RDI = frequency in Hz (0 is ignored: nothing is programmed) | |
| ; Preserves all registers. | |
| push rax | |
| push rcx | |
| push rdx | |
| ; A zero frequency would raise #DE in the DIV below | |
| test rdi, rdi | |
| jz .done | |
| ; The base lives in RCX because DIV needs RDX:RAX for the dividend | |
| mov rcx, [apic_base] | |
| ; Set divide value to 16 | |
| mov dword [rcx + APIC_TIMER_DIV], 0x03 | |
| ; Calculate initial count | |
| ; count = (bus_frequency / 16) / desired_frequency | |
| mov rax, [apic_bus_frequency] | |
| shr rax, 4 ; Divide by 16 | |
| xor edx, edx | |
| div rdi ; Divide by desired frequency | |
| ; Set initial count (the register is 32 bits wide; only EAX is written) | |
| mov [rcx + APIC_TIMER_INIT], eax | |
| ; Start timer in periodic mode | |
| mov dword [rcx + APIC_TIMER_LVT], 0x20020 | |
| .done: | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| ret</code></pre> | |
| <h4 id="io-apic-configuration"><strong>I/O APIC | |
| Configuration</strong></h4> | |
| <pre class="assembly"><code>; I/O APIC Registers | |
| ; Register indices: these are written to the select register by | |
| ; ioapic_read / ioapic_write, they are not memory offsets. | |
| IOAPIC_ID equ 0x00 | |
| IOAPIC_VERSION equ 0x01 | |
| IOAPIC_ARB equ 0x02 | |
| IOAPIC_REDTBL equ 0x10 ; Redirection table base | |
| ; I/O APIC indirect access | |
| ; Reads one 32-bit I/O APIC register through the select/data window. | |
| ioapic_read: | |
| ; Input: RDI = register | |
| ; Output: RAX = value | |
| mov rax, [ioapic_base] | |
| mov dword [rax], edi ; Select register | |
| mov eax, dword [rax + 0x10] ; Read data | |
| ret | |
| ; Writes one 32-bit I/O APIC register through the select/data window. | |
| ; Only the low 32 bits of RDI and RSI are used. Clobbers: RAX | |
| ioapic_write: | |
| ; Input: RDI = register, RSI = value | |
| mov rax, [ioapic_base] | |
| mov dword [rax], edi ; Select register | |
| mov dword [rax + 0x10], esi ; Write data | |
| ret | |
| ; Configure I/O APIC redirection entry | |
| setup_ioapic_entry: | |
| ; Input: RDI = IRQ, RSI = vector, RDX = destination APIC ID | |
| ; Preserves all registers. | |
| push rax | |
| push rcx | |
| push rdi | |
| push rsi | |
| ; Calculate redirection table register | |
| shl rdi, 1 ; Each entry is 2 registers | |
| add rdi, IOAPIC_REDTBL ; RDI = index of the entry's low dword | |
| mov rcx, rsi ; Keep the vector (ioapic_write only clobbers RAX) | |
| ; High 32 bits: destination APIC ID in bits 24-31 | |
| ; Written first so the entry is never unmasked with a stale destination | |
| inc rdi | |
| mov rsi, rdx | |
| shl rsi, 24 | |
| call ioapic_write | |
| dec rdi | |
| ; Low 32 bits: vector, delivery mode, destination mode | |
| mov rax, rcx ; Vector | |
| or rax, (0 << 8) ; Fixed delivery | |
| or rax, (0 << 11) ; Physical destination | |
| or rax, (0 << 13) ; Active high | |
| or rax, (0 << 15) ; Edge triggered | |
| or rax, (0 << 16) ; Unmask interrupt | |
| mov rsi, rax | |
| call ioapic_write | |
| pop rsi | |
| pop rdi | |
| pop rcx | |
| pop rax | |
| ret</code></pre> | |
| <hr /> | |
| <h2 | |
| id="chapter-14-security-extensions-and-virtualization"><strong>Chapter | |
| 14: Security Extensions and Virtualization</strong></h2> | |
| <h3 id="hardware-assisted-security-features"><strong>14.1 | |
| Hardware-Assisted Security Features</strong></h3> | |
| <h4 id="nx-bit-no-execute"><strong>NX Bit (No-Execute)</strong></h4> | |
| <p>The NX bit, enabled via <code>IA32_EFER.NXE</code> (MSR | |
| <code>0xC0000080</code>), prevents code execution from data pages. When | |
| set in a Page Table Entry (PTE), it marks the page as | |
| non-executable.</p> | |
| <pre class="assembly"><code>; Enable NX support | |
| ; Sets EFER.NXE and then sets the NX bit in the entry stored at | |
| ; data_page_pte_addr. Clobbers: RAX, RCX, RDX | |
| enable_nx: | |
| mov ecx, 0xC0000080 ; IA32_EFER MSR | |
| rdmsr | |
| or eax, (1 << 11) ; Set NXE bit | |
| wrmsr | |
| ; Mark data pages non-executable in page tables | |
| ; BTS is used because "or rax, (1 << 63)" cannot be encoded: OR only | |
| ; accepts a sign-extended 32-bit immediate | |
| bts qword [data_page_pte_addr], 63 ; Set bit 63 (NX bit) in PTE | |
| ; NOTE(review): if this mapping is already live, the TLB entry for the | |
| ; page presumably needs invalidating (invlpg) - confirm | |
| ret | |
| ; Page fault handler for NX violations | |
| ; This routine returns with RET, so it is a helper called from the #PF | |
| ; entry stub, not the IDT target itself. | |
| ; NOTE(review): assumes the stub calls it with the CPU-pushed error code | |
| ; directly above the return address - confirm against the stub | |
| nx_fault_handler: | |
| ; CR2 contains faulting address | |
| mov rdi, cr2 | |
| ; [rsp] is our return address; the error code sits just above it | |
| mov rsi, [rsp + 8] ; Error code on stack | |
| ; An NX violation is an instruction fetch (bit 4) from a page that is | |
| ; present (bit 0). Bit 4 alone also matches a fetch from a not-present | |
| ; page, which is an ordinary page fault. | |
| mov eax, esi | |
| and eax, 0x11 | |
| cmp eax, 0x11 | |
| jne .not_nx_fault | |
| ; Handle NX violation (RDI = faulting address, RSI = error code) | |
| call log_nx_violation | |
| ; Terminate offending process | |
| call terminate_current_process | |
| .not_nx_fault: | |
| ret</code></pre> | |
| <h4 | |
| id="smapsmep-supervisor-mode-accessexecution-prevention"><strong>SMAP/SMEP | |
| (Supervisor Mode Access/Execution Prevention)</strong></h4> | |
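| <p>SMEP prevents the kernel from executing code located in user-space | |
| pages, and SMAP prevents it from reading or writing user-space data | |
| outside explicitly opened windows (<code>STAC</code>/<code>CLAC</code>).</p> | |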
| <pre class="assembly"><code>; Enable SMAP and SMEP | |
| ; Enables SMEP and SMAP in CR4, each one only if CPUID reports it. | |
| ; Clobbers: RAX, RCX, RDX (RBX is preserved) | |
| enable_smap_smep: | |
| push rbx ; CPUID overwrites RBX, which callers expect preserved | |
| ; Check CPU support via CPUID | |
| mov eax, 7 | |
| xor ecx, ecx | |
| cpuid | |
| mov rax, cr4 | |
| ; The two features are independent: a CPU with SMEP but no SMAP must | |
| ; still get SMEP enabled | |
| test ebx, (1 << 7) ; SMEP support | |
| jz .no_smep | |
| or rax, (1 << 20) ; SMEP bit | |
| .no_smep: | |
| test ebx, (1 << 20) ; SMAP support | |
| jz .no_smap | |
| or rax, (1 << 21) ; SMAP bit | |
| .no_smap: | |
| ; Enable in CR4 (unchanged if neither feature is supported) | |
| mov cr4, rax | |
| pop rbx | |
| ret | |
| ; Safe user memory access with SMAP | |
| copy_from_user: | |
| ; Input: RDI = kernel buffer, RSI = user buffer, RDX = size | |
| ; Returns: RAX = 0 on success, -EFAULT on failure | |
| ; Clobbers: RCX, RSI, RDI (advanced by the copy) | |
| ; Verify user address range before opening the SMAP window | |
| mov rax, USER_SPACE_END | |
| sub rax, rsi ; RAX = bytes between RSI and the end of user space | |
| jb .fault ; RSI is above USER_SPACE_END (without this the | |
| ; subtraction wraps and the size check passes) | |
| cmp rax, rdx | |
| jb .fault | |
| ; Temporarily allow user access | |
| stac ; Set AC flag | |
| ; NOTE(review): a page fault during the copy is not recovered here; | |
| ; that presumably needs a fixup in the #PF handler - confirm | |
| cld | |
| mov rcx, rdx | |
| rep movsb | |
| ; Disable user access | |
| clac ; Clear AC flag | |
| xor eax, eax ; Success | |
| ret | |
| .fault: | |
| ; AC was never set on this path, so no CLAC is needed | |
| mov rax, -EFAULT | |
| ret</code></pre> | |
| <h4 id="intel-cet-control-flow-enforcement-technology"><strong>Intel CET | |
| (Control-flow Enforcement Technology)</strong></h4> | |
| <p>CET provides hardware-based protection against ROP/JOP attacks | |
| through shadow stacks and indirect branch tracking.</p> | |
| <pre class="assembly"><code>; Shadow Stack Setup | |
| ; Enables CET shadow stacks for user mode if CPUID reports CET_SS: | |
| ; sets CR4.CET, allocates a shadow stack, loads IA32_PL3_SSP and sets | |
| ; SH_STK_EN in IA32_U_CET. Does nothing if CET_SS is not reported. | |
| ; Clobbers: RAX, RCX, RDX, RDI (RBX is preserved) | |
| setup_shadow_stack: | |
| push rbx ; CPUID overwrites RBX, which callers expect preserved | |
| ; Check CET support | |
| mov eax, 7 | |
| xor ecx, ecx | |
| cpuid | |
| test ecx, (1 << 7) ; CET_SS bit | |
| jz .no_cet | |
| ; Enable CET in CR4 | |
| mov rax, cr4 | |
| or rax, (1 << 23) ; CET bit | |
| mov cr4, rax | |
| ; Allocate shadow stack | |
| ; NOTE(review): the result in RAX is used unchecked, and is assumed to | |
| ; be the initial SSP value - confirm the allocator's contract | |
| mov rdi, SHADOW_STACK_SIZE | |
| call allocate_shadow_stack_memory | |
| ; Load shadow stack pointer (WRMSR takes the value in EDX:EAX) | |
| mov ecx, 0x6A7 ; IA32_PL3_SSP MSR (0x6A4 is IA32_PL0_SSP) | |
| mov rdx, rax | |
| shr rdx, 32 | |
| wrmsr | |
| ; Enable shadow stack in IA32_U_CET | |
| mov ecx, 0x6A0 ; IA32_U_CET MSR | |
| rdmsr | |
| or eax, 0x01 ; SH_STK_EN | |
| wrmsr | |
| .no_cet: | |
| pop rbx | |
| ret | |
| ; Indirect Branch Tracking | |
| ; Valid indirect targets must begin with ENDBRANCH | |
| ; Skeleton of a function that may be reached by an indirect call or jump. | |
| valid_indirect_target: | |
| endbr64 ; Mark as valid branch target | |
| ; Function code follows... | |
| ret | |
| ; CET-aware exception handler | |
| ; This routine returns with RET, so it is a helper called from the | |
| ; exception entry stub, not the IDT target itself. | |
| ; NOTE(review): assumes the stub calls it with the CPU-pushed frame | |
| ; (error code, then RIP) directly above the return address - confirm | |
| cet_exception_handler: | |
| ; Check if CET exception | |
| ; [rsp] is our return address, so the frame starts at [rsp + 8] | |
| mov rax, [rsp + 8] ; Error code | |
| test rax, CET_FAULT_FLAG | |
| jz .not_cet | |
| ; Log CET violation | |
| mov rdi, [rsp + 16] ; RIP of violation | |
| call log_cet_violation | |
| ; Terminate process | |
| call terminate_current_process | |
| .not_cet: | |
| ret</code></pre> | |
| <h4 id="intel-sgx-software-guard-extensions"><strong>Intel SGX (Software | |
| Guard Extensions)</strong></h4> | |
| <p>SGX creates secure enclaves for protecting sensitive code and | |
| data.</p> | |
| <pre class="assembly"><code>; Check SGX support | |
| ; Queries CPUID leaf 7 for the SGX bit and, if it is set, CPUID leaf 0x12. | |
| ; NOTE(review): no status is returned - on both paths the registers | |
| ; simply hold the output of the last CPUID executed. | |
| ; NOTE(review): CPUID overwrites RBX and it is not saved here - confirm | |
| ; callers do not rely on RBX being preserved. | |
| check_sgx_support: | |
| mov eax, 7 | |
| xor ecx, ecx | |
| cpuid | |
| test ebx, (1 << 2) ; SGX bit | |
| jz .no_sgx | |
| ; Check SGX leaf functions | |
| mov eax, 0x12 | |
| xor ecx, ecx | |
| cpuid | |
| ; EAX contains SGX1 support flags | |
| ; EBX contains MISCSELECT | |
| ; EDX contains maximum enclave size | |
| .no_sgx: | |
| ret | |
| ; Enclave creation flow (simplified) | |
| ; Outline only: reserve a region, then issue ENCLS four times in the order | |
| ; ECREATE, EADD + EEXTEND per page, EINIT. | |
| ; NOTE(review): ENCLS selects its leaf from EAX and takes its operands in | |
| ; RBX/RCX/RDX, but this listing never loads EAX and places operands in | |
| ; RDI/RSI. As written every ENCLS runs whatever leaf EAX happens to hold; | |
| ; treat this as pseudo-code and check the Intel SDM before reuse. | |
| create_enclave: | |
| ; 1. Reserve memory region for enclave | |
| mov rdi, enclave_size | |
| call reserve_memory_region | |
| mov [enclave_base], rax | |
| ; 2. Create SECS (SGX Enclave Control Structure) | |
| lea rdi, [secs_page] | |
| mov rsi, [enclave_base] | |
| encls ; ECREATE instruction | |
| ; 3. Add pages to enclave | |
| mov rcx, [enclave_pages] | |
| .add_pages: | |
| ; RCX is the loop counter, saved around the ENCLS pair | |
| push rcx | |
| ; Add regular page | |
| ; NOTE(review): page_info is not advanced between iterations here | |
| lea rdi, [page_info] | |
| encls ; EADD instruction | |
| ; Extend measurement | |
| lea rdi, [page_info] | |
| encls ; EEXTEND instruction | |
| pop rcx | |
| loop .add_pages | |
| ; 4. Initialize enclave | |
| lea rdi, [sigstruct] | |
| lea rsi, [secs_page] | |
| lea rdx, [einittoken] | |
| encls ; EINIT instruction | |
| ret | |
| ; Enter enclave | |
| ; Enters the enclave whose TCS is tcs_page, with aep_handler as the | |
| ; asynchronous exit pointer. Preserves RBX and RCX; clobbers RAX. | |
| enter_enclave: | |
| ; Save state | |
| push rbx | |
| push rcx | |
| ; Set up parameters | |
| mov eax, 2 ; ENCLU leaf: EENTER (ENCLU dispatches on EAX) | |
| lea rbx, [tcs_page] ; Thread Control Structure | |
| lea rcx, [aep_handler] ; Asynchronous Exit Pointer | |
| ; Enter enclave | |
| enclu ; EENTER instruction | |
| ; Returns here after enclave exit | |
| ; NOTE(review): the resume address is chosen by the enclave at exit; | |
| ; this presumably relies on it returning to this point - confirm | |
| pop rcx | |
| pop rbx | |
| ret</code></pre> | |
| <h3 id="virtualization-architecture"><strong>14.2 Virtualization | |
| Architecture</strong></h3> | |
| <h4 id="intel-vt-x-vmx-fundamentals"><strong>Intel VT-x (VMX) | |
| Fundamentals</strong></h4> | |
| <pre class="assembly"><code>; VMX capability checking | |
| ; If CPUID reports VMX, reads IA32_VMX_BASIC and stores the VMCS revision | |
| ; identifier in [vmcs_revision_id] and the VMCS region size in | |
| ; [vmcs_region_size]. Stores nothing if VMX is not reported. | |
| ; Clobbers: RAX, RCX, RDX (RBX is preserved) | |
| check_vmx_capability: | |
| push rbx ; CPUID overwrites RBX, which callers expect preserved | |
| ; Check CPUID for VMX support | |
| mov eax, 1 | |
| cpuid | |
| test ecx, (1 << 5) ; VMX bit | |
| jz .no_vmx | |
| ; Read VMX capability MSRs | |
| mov ecx, 0x480 ; IA32_VMX_BASIC | |
| rdmsr | |
| ; RDMSR returns the MSR split across EDX:EAX: | |
| ; EAX[30:0] = VMCS revision ID (MSR bits 30:0) | |
| ; EDX[12:0] = VMCS region size (MSR bits 44:32) | |
| mov [vmcs_revision_id], eax | |
| ; The size is in EDX; shifting RAX right by 32 would always give 0 | |
| mov eax, edx | |
| and eax, 0x1FFF | |
| mov [vmcs_region_size], rax | |
| .no_vmx: | |
| pop rbx | |
| ret | |
| ; VMCS (Virtual Machine Control Structure) setup | |
| ; Allocates a VMCS region, stamps the revision identifier, makes it the | |
| ; current VMCS and fills in host, guest and control fields. | |
| ; Returns: RAX = 0 on success, -1 if VMCLEAR or VMPTRLD fails | |
| setup_vmcs: | |
| ; Allocate 4KB aligned VMCS region | |
| ; NOTE(review): the allocation result is used unchecked, and VMCLEAR / | |
| ; VMPTRLD expect a physical address in [vmcs_region] - confirm | |
| mov rdi, 4096 | |
| mov rsi, 4096 ; Alignment | |
| call allocate_aligned_memory | |
| mov [vmcs_region], rax | |
| ; Write VMCS revision identifier | |
| mov rdi, rax | |
| mov eax, [vmcs_revision_id] | |
| mov [rdi], eax | |
| ; Clear VMCS (failure is reported through CF or ZF) | |
| vmclear [vmcs_region] | |
| jc .vmclear_failed | |
| jz .vmclear_failed | |
| ; Load VMCS | |
| vmptrld [vmcs_region] | |
| jc .vmptrld_failed | |
| jz .vmptrld_failed | |
| ; Configure VMCS fields | |
| call setup_vmcs_host_state | |
| call setup_vmcs_guest_state | |
| call setup_vmcs_controls | |
| ; Explicit success code: without it RAX holds whatever the last helper | |
| ; left there and callers cannot tell success from the -1 below | |
| xor eax, eax | |
| ret | |
| .vmclear_failed: | |
| .vmptrld_failed: | |
| mov rax, -1 | |
| ret | |
| ; VMCS host state setup | |
| ; Writes the current CR0/CR3/CR4, the host stack top, the VM-exit entry | |
| ; point and the CS/DS selectors into the current VMCS. | |
| ; Clobbers: RAX, RCX | |
| ; NOTE(review): no VMWRITE result (CF/ZF) is checked here. | |
| setup_vmcs_host_state: | |
| ; Host CR0 | |
| mov rax, cr0 | |
| mov rcx, 0x6C00 ; HOST_CR0 | |
| vmwrite rcx, rax | |
| ; Host CR3 | |
| mov rax, cr3 | |
| mov rcx, 0x6C02 ; HOST_CR3 | |
| vmwrite rcx, rax | |
| ; Host CR4 | |
| mov rax, cr4 | |
| mov rcx, 0x6C04 ; HOST_CR4 | |
| vmwrite rcx, rax | |
| ; Host RSP | |
| lea rax, [host_stack_top] | |
| mov rcx, 0x6C14 ; HOST_RSP | |
| vmwrite rcx, rax | |
| ; Host RIP (VM exit handler) | |
| lea rax, [vm_exit_handler] | |
| mov rcx, 0x6C16 ; HOST_RIP | |
| vmwrite rcx, rax | |
| ; Host segment selectors | |
| ; Only AX is replaced; the upper bits of RAX still hold the previous | |
| ; value. NOTE(review): presumably harmless because the selector fields | |
| ; are 16 bits wide - confirm | |
| mov ax, cs | |
| mov rcx, 0x0C02 ; HOST_CS_SELECTOR | |
| vmwrite rcx, rax | |
| mov ax, ds | |
| mov rcx, 0x0C06 ; HOST_DS_SELECTOR | |
| vmwrite rcx, rax | |
| ; Continue with other segments... | |
| ret | |
| ; VM entry | |
| ; Executes VMXON, loads guest_vmcs and then launches or resumes the guest | |
| ; depending on [vm_launched]. Control only comes back to this routine if | |
| ; VMXON or the VM entry fails. | |
| vm_enter: | |
| ; Enter VMX operation (this does not test whether VMX is already on) | |
| ; NOTE(review): only CF is checked; a failure reported through ZF, | |
| ; and any VMPTRLD failure below, go unnoticed | |
| vmxon [vmxon_region] | |
| jc .vmxon_failed | |
| ; Load guest VMCS | |
| vmptrld [guest_vmcs] | |
| ; Launch or resume VM | |
| ; NOTE(review): vm_launched is only read here, never set | |
| mov rax, [vm_launched] | |
| test rax, rax | |
| jnz .vm_resume | |
| ; First launch | |
| vmlaunch | |
| jmp .vm_entry_failed | |
| .vm_resume: | |
| vmresume | |
| ; Falls through into the failure path if VMRESUME returns | |
| .vm_entry_failed: | |
| ; VM entry failed, check error | |
| mov rcx, 0x4400 ; VM_INSTRUCTION_ERROR | |
| vmread rax, rcx | |
| ; Handle error... | |
| .vmxon_failed: | |
| ret | |
| ; VM exit handler | |
| ; Entered at HOST_RIP on every VM exit. Saves the guest's general purpose | |
| ; registers on the host stack, dispatches on the basic exit reason, then | |
| ; restores the registers and resumes the guest. | |
| ; General purpose registers are not part of the VMCS: the guest's values | |
| ; are whatever the registers hold on entry, i.e. the slots pushed below. | |
| ; Slot offsets once all 15 registers are pushed: | |
| ; RAX = [rsp + 112], RCX = [rsp + 104], RDX = [rsp + 96], RBX = [rsp + 88] | |
| vm_exit_handler: | |
| ; Save guest general purpose registers | |
| push rax | |
| push rcx | |
| push rdx | |
| push rbx | |
| push rbp | |
| push rsi | |
| push rdi | |
| push r8 | |
| push r9 | |
| push r10 | |
| push r11 | |
| push r12 | |
| push r13 | |
| push r14 | |
| push r15 | |
| ; Read exit reason | |
| mov rcx, 0x4402 ; EXIT_REASON | |
| vmread rax, rcx | |
| ; Dispatch based on exit reason | |
| ; NOTE(review): only .handle_cpuid is defined in this listing; the other | |
| ; local targets must be supplied elsewhere | |
| and rax, 0xFFFF ; Basic exit reason | |
| cmp rax, EXIT_REASON_CPUID | |
| je .handle_cpuid | |
| cmp rax, EXIT_REASON_IO | |
| je .handle_io | |
| cmp rax, EXIT_REASON_MSR_READ | |
| je .handle_msr_read | |
| cmp rax, EXIT_REASON_MSR_WRITE | |
| je .handle_msr_write | |
| cmp rax, EXIT_REASON_EPT_VIOLATION | |
| je .handle_ept_violation | |
| ; ... other exit reasons ... | |
| ; Unrecognised reasons fall through and simply resume the guest | |
| .exit_dispatch_done: | |
| ; Restore registers | |
| pop r15 | |
| pop r14 | |
| pop r13 | |
| pop r12 | |
| pop r11 | |
| pop r10 | |
| pop r9 | |
| pop r8 | |
| pop rdi | |
| pop rsi | |
| pop rbp | |
| pop rbx | |
| pop rdx | |
| pop rcx | |
| pop rax | |
| ; Resume guest | |
| vmresume | |
| ; Should not reach here | |
| jmp vm_resume_failed | |
| .handle_cpuid: | |
| ; Emulate CPUID instruction | |
| ; Advance guest RIP by the length the CPU recorded for the exiting | |
| ; instruction (2 bytes for a plain CPUID, more if it carried prefixes) | |
| mov rcx, 0x440C ; VM_EXIT_INSTRUCTION_LEN | |
| vmread rax, rcx | |
| mov rcx, 0x681E ; GUEST_RIP | |
| vmread rsi, rcx | |
| add rsi, rax | |
| vmwrite rcx, rsi | |
| ; Get CPUID leaf and subleaf from the saved guest RAX/RCX | |
| ; (encodings 0x6800-0x6806 are GUEST_CR0/CR3/CR4/ES_BASE, not the GPRs) | |
| mov eax, [rsp + 112] ; Guest EAX = leaf | |
| mov ecx, [rsp + 104] ; Guest ECX = subleaf | |
| ; Execute CPUID | |
| cpuid | |
| ; Write results back to the saved guest registers; they reach the | |
| ; guest through the pops at .exit_dispatch_done | |
| mov [rsp + 112], rax | |
| mov [rsp + 88], rbx | |
| mov [rsp + 104], rcx | |
| mov [rsp + 96], rdx | |
| jmp .exit_dispatch_done</code></pre> | |
| <h4 id="extended-page-tables-ept"><strong>Extended Page Tables | |
| (EPT)</strong></h4> | |
| <pre class="assembly"><code>; EPT structure setup | |
| ; Allocates and zeroes an EPT PML4 table, builds the first 1GB identity | |
| ; mapping, then writes the EPT pointer and sets the EPT enable bit in the | |
| ; secondary execution controls of the current VMCS. | |
| ; Clobbers: RAX, RCX, RDI (plus whatever the called helpers clobber) | |
| setup_ept: | |
| ; Allocate EPT PML4 table | |
| mov rdi, 4096 | |
| call allocate_page | |
| mov [ept_pml4], rax | |
| ; Clear EPT PML4 | |
| ; 512 qwords = 4096 bytes; REP STOSQ relies on the direction flag being clear | |
| mov rdi, rax | |
| xor eax, eax | |
| mov rcx, 512 | |
| rep stosq | |
| ; Set up identity mapping for first 1GB | |
| call setup_ept_identity_1gb | |
| ; Set EPTP in VMCS | |
| ; NOTE(review): the EPT pointer needs a physical address; assumes | |
| ; allocate_page returned one - confirm | |
| mov rax, [ept_pml4] | |
| or rax, 0x1E ; Memory type = WB, page walk = 4 | |
| mov rcx, 0x201A ; EPT_POINTER | |
| vmwrite rcx, rax | |
| ; Enable EPT in secondary controls | |
| mov rcx, 0x401E ; SECONDARY_VM_EXEC_CONTROL | |
| vmread rax, rcx | |
| or rax, (1 << 1) ; Enable EPT | |
| vmwrite rcx, rax | |
| ret | |
| ; EPT violation handler | |
| ; Reads the faulting guest-physical address and the exit qualification, | |
| ; then either maps the page (if validate_guest_access returns non-zero) | |
| ; or injects a page fault into the guest. | |
| handle_ept_violation: | |
| ; Read guest physical address | |
| mov rcx, 0x2400 ; GUEST_PHYSICAL_ADDRESS | |
| vmread rdi, rcx | |
| ; Read exit qualification | |
| mov rcx, 0x6400 ; EXIT_QUALIFICATION (0x640A is the guest linear address) | |
| vmread rsi, rcx | |
| ; Check violation type | |
| ; All three cases currently share one code path; the tests are kept | |
| ; as the hook for per-access-type handling | |
| test rsi, 0x01 ; Read access | |
| jnz .handle_read | |
| test rsi, 0x02 ; Write access | |
| jnz .handle_write | |
| test rsi, 0x04 ; Execute access | |
| jnz .handle_execute | |
| .handle_read: | |
| .handle_write: | |
| .handle_execute: | |
| ; Map page if valid access (RDI = GPA, RSI = exit qualification) | |
| ; RDI is caller-saved, so the GPA is kept on the stack across the call | |
| push rdi | |
| call validate_guest_access | |
| pop rdi | |
| test rax, rax | |
| jz .invalid_access | |
| ; Add EPT mapping | |
| mov rsi, rdi ; GPA | |
| mov rdx, [host_page] ; HPA | |
| mov rcx, EPT_READ | EPT_WRITE | EPT_EXECUTE | |
| call add_ept_mapping | |
| jmp .done | |
| .invalid_access: | |
| ; Inject exception to guest | |
| call inject_guest_page_fault | |
| .done: | |
| ret</code></pre> | |
| <h3 id="multi-core-and-multi-threading-security"><strong>14.3 Multi-Core | |
| and Multi-Threading Security</strong></h3> | |
| <h4 id="per-cpu-security-state"><strong>Per-CPU Security | |
| State</strong></h4> | |
| <pre class="assembly"><code>; Per-CPU security context | |
| ; One record per CPU: 6 single qwords followed by a 7-qword array | |
| ; (13 qwords = 104 bytes). | |
| struc cpu_security_context | |
| .cr3_kernel resq 1 ; Kernel page tables | |
| .cr3_user resq 1 ; User page tables (KPTI) | |
| .shadow_stack resq 1 ; CET shadow stack | |
| .gs_base_kernel resq 1 ; Kernel GS base | |
| .gs_base_user resq 1 ; User GS base | |
| .tss_addr resq 1 ; Task State Segment | |
| .ist_stacks resq 7 ; Interrupt Stack Table | |
| endstruc | |
| ; Initialize per-CPU security | |
| ; Fills this CPU's cpu_security_context: kernel and user page-table roots | |
| ; and seven IST stacks, then sets up the TSS. | |
| ; Preserves RBX and R12; other caller-saved registers are clobbered. | |
| init_cpu_security: | |
| ; The context pointer and loop counter must survive the calls below, | |
| ; so they live in callee-saved registers. Two pushes plus 8 bytes of | |
| ; padding keep RSP 16-byte aligned at each call. | |
| push rbx | |
| push r12 | |
| sub rsp, 8 | |
| ; Get CPU ID | |
| mov rax, [gs:cpu_id] | |
| ; Calculate per-CPU data offset | |
| mov rcx, cpu_security_context_size | |
| mul rcx | |
| lea rbx, [cpu_security_contexts + rax] ; RBX = this CPU's context | |
| ; Set up kernel page tables with KPTI | |
| call create_kernel_page_tables | |
| mov [rbx + cpu_security_context.cr3_kernel], rax | |
| ; Create user page tables (minimal kernel mapping) | |
| call create_user_page_tables | |
| mov [rbx + cpu_security_context.cr3_user], rax | |
| ; Allocate IST stacks for critical exceptions | |
| ; R12 counts 7..1 and fills slots 6..0 | |
| mov r12, 7 | |
| .alloc_ist: | |
| mov rdi, IST_STACK_SIZE ; Argument only; the store below uses RBX | |
| call allocate_stack | |
| ; NOTE(review): stacks grow down, so a TSS IST entry needs the top of | |
| ; the stack - confirm what allocate_stack returns | |
| mov [rbx + cpu_security_context.ist_stacks + r12*8 - 8], rax | |
| dec r12 | |
| jnz .alloc_ist | |
| ; Set up TSS with IST | |
| ; NOTE(review): presumably takes the context pointer in RDI - confirm | |
| mov rdi, rbx | |
| call setup_tss_with_ist | |
| add rsp, 8 | |
| pop r12 | |
| pop rbx | |
| ret | |
| ; KPTI (Kernel Page Table Isolation) switching | |
| ; Loads CR3 from this CPU's cr3_user field. Preserves RAX. | |
| ; NOTE(review): POP and RET run after the switch, so the current stack | |
| ; and this code must be mapped in the user page tables - confirm | |
| switch_to_user_cr3: | |
| push rax | |
| mov rax, [gs:cpu_security + cpu_security_context.cr3_user] | |
| mov cr3, rax | |
| pop rax | |
| ret | |
| ; Loads CR3 from this CPU's cr3_kernel field. Preserves RAX. | |
| switch_to_kernel_cr3: | |
| push rax | |
| mov rax, [gs:cpu_security + cpu_security_context.cr3_kernel] | |
| mov cr3, rax | |
| pop rax | |
| ret</code></pre> | |
| <h4 id="speculation-control"><strong>Speculation Control</strong></h4> | |
| <pre class="assembly"><code>; Mitigation for speculative execution vulnerabilities | |
| ; Indirect Branch Prediction Barrier | |
| ; Writes 1 to MSR 0x49. Clobbers: RAX, RCX, RDX | |
| ; NOTE(review): no CPUID check is made before the write. | |
| ibpb_barrier: | |
| mov ecx, 0x49 ; IA32_PRED_CMD | |
| mov eax, 1 ; IBPB command | |
| xor edx, edx | |
| wrmsr | |
| ret | |
| ; Indirect Branch Restricted Speculation | |
| ; Read-modify-write of MSR 0x48 that sets bit 0 and keeps the other bits. | |
| ; Clobbers: RAX, RCX, RDX | |
| enable_ibrs: | |
| mov ecx, 0x48 ; IA32_SPEC_CTRL | |
| rdmsr | |
| or eax, 0x01 ; Set IBRS | |
| wrmsr | |
| ret | |
| ; Single Thread Indirect Branch Predictors | |
| ; Read-modify-write of MSR 0x48 that sets bit 1 and keeps the other bits. | |
| ; Clobbers: RAX, RCX, RDX | |
| enable_stibp: | |
| mov ecx, 0x48 ; IA32_SPEC_CTRL | |
| rdmsr | |
| or eax, 0x02 ; Set STIBP | |
| wrmsr | |
| ret | |
| ; Speculative Store Bypass Disable | |
| ; Read-modify-write of MSR 0x48 that sets bit 2 and keeps the other bits. | |
| ; Clobbers: RAX, RCX, RDX | |
| enable_ssbd: | |
| mov ecx, 0x48 ; IA32_SPEC_CTRL | |
| rdmsr | |
| or eax, 0x04 ; Set SSBD | |
| wrmsr | |
| ret | |
| ; Return Stack Buffer clearing | |
| ; Overwrites 32 return-predictor entries with the address of a harmless | |
| ; speculation trap. Clobbers: RCX, flags. RSP is unchanged on return. | |
| rsb_clear: | |
| ; Fill RSB with known targets | |
| mov ecx, 32 ; RSB depth | |
| .fill_loop: | |
| call .next ; Pushes a return address (and RSB entry) = .trap | |
| .trap: | |
| ; Never reached architecturally. A later RET predicted from one of | |
| ; these entries speculates into this loop and stalls, instead of | |
| ; running the live loop code as it would with a plain fall-through | |
| pause | |
| lfence | |
| jmp .trap | |
| .next: | |
| dec ecx | |
| jnz .fill_loop | |
| ; Drop the 32 return addresses the calls left on the stack | |
| add rsp, 32 * 8 | |
| ret | |
| ; Microarchitectural Data Sampling mitigation | |
| ; Writes 1 to MSR 0x10B, then executes VERW on a memory operand. | |
| ; Clobbers: RAX, RCX, RDX, flags | |
| mds_clear: | |
| ; Clear CPU buffers | |
| mov ecx, 0x10B ; IA32_FLUSH_CMD | |
| mov eax, 1 ; L1D_FLUSH command | |
| xor edx, edx | |
| wrmsr | |
| ; VERW instruction to clear other buffers | |
| verw word [ds:verw_operand] | |
| ret | |
| ; Selector operand for the VERW above (a null selector). It sits after | |
| ; the RET, so it is never executed. | |
| verw_operand: | |
| dw 0 | |
| ; Context switch security | |
| ; Runs ibpb_barrier and mds_clear, loads CR3 from next_task's task.cr3 | |
| ; field, then zeroes the caller-saved general purpose registers. | |
| secure_context_switch: | |
| ; Clear speculation state | |
| call ibpb_barrier | |
| ; Clear microarchitectural state | |
| call mds_clear | |
| ; Switch page tables (KPTI) | |
| mov rax, [next_task] | |
| mov rax, [rax + task.cr3] | |
| mov cr3, rax | |
| ; Clear sensitive registers | |
| ; RBX, RBP and R12-R15 are not touched here | |
| xor rax, rax | |
| xor rcx, rcx | |
| xor rdx, rdx | |
| xor rsi, rsi | |
| xor rdi, rdi | |
| xor r8, r8 | |
| xor r9, r9 | |
| xor r10, r10 | |
| xor r11, r11 | |
| ret</code></pre> | |
| <h3 id="secure-coding-practices"><strong>14.4 Secure Coding | |
| Practices</strong></h3> | |
| <h4 id="stack-protection"><strong>Stack Protection</strong></h4> | |
| <pre class="assembly"><code>; Function with stack canary | |
| ; Template for a function guarded by a stack canary: the value at | |
| ; [gs:stack_canary] is copied to [rbp-8] on entry and compared again | |
| ; before returning. | |
| secure_function: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 0x100 ; Local variables | |
| ; Place canary | |
| ; [rbp-8] is the slot directly below the saved RBP, so an overflow from | |
| ; the locals crosses it before reaching the saved RBP or return address | |
| mov rax, [gs:stack_canary] | |
| mov [rbp-8], rax | |
| ; Function body | |
| ; ... code ... | |
| ; Check canary before return | |
| mov rax, [rbp-8] | |
| cmp rax, [gs:stack_canary] | |
| jne .stack_smash_detected | |
| leave | |
| ret | |
| .stack_smash_detected: | |
| call __stack_chk_fail | |
| ; Never returns | |
| ; ROP protection with CET | |
| ; Template only: apart from ENDBR64 this is an ordinary frame-pointer | |
| ; prologue and epilogue; no shadow-stack instruction appears in the code. | |
| cet_protected_function: | |
| endbr64 ; Valid indirect branch target | |
| ; Function prologue with shadow stack | |
| push rbp | |
| mov rbp, rsp | |
| ; Body | |
| ; ... code ... | |
| ; Return - automatically verified against shadow stack | |
| leave | |
| ret | |
| ; Control Flow Guard implementation | |
| ; Validates an indirect-call target against the CFG bitmap and, if it is | |
| ; allowed, transfers control to it. | |
| ; Input: RCX = target address. RAX and RDX are preserved for the target. | |
| cfg_check: | |
| ; Before indirect call | |
| ; RCX = target address | |
| push rax | |
| push rdx | |
| ; Hash target address | |
| mov rax, rcx | |
| shr rax, 4 ; Align to 16 bytes | |
| ; Check in CFG bitmap (one bit per 16-byte slot; BT with a register | |
| ; index addresses bits beyond the first qword) | |
| mov rdx, [cfg_bitmap_base] | |
| bt [rdx], rax | |
| jnc .invalid_target | |
| pop rdx | |
| pop rax | |
| ; Safe to call | |
| ; Tail jump, not call + ret: the target then sees exactly the stack our | |
| ; caller set up (an extra return address would break its 16-byte | |
| ; alignment) and returns straight to that caller | |
| jmp rcx | |
| .invalid_target: | |
| ; CFG violation | |
| call cfg_violation_handler | |
| ; Never returns</code></pre> | |
| <h4 id="secure-memory-operations"><strong>Secure Memory | |
| Operations</strong></h4> | |
| <pre class="assembly"><code>; Constant-time memory comparison | |
| ; Compares two buffers without an early exit, so the running time depends | |
| ; only on the length, never on where the first difference is. | |
| ; Returns: RAX = 1 if the buffers are equal, 0 otherwise. | |
| ; A length of 0 returns 1 without touching memory. Clobbers: RCX, R8 | |
| constant_time_compare: | |
| ; RDI = ptr1, RSI = ptr2, RDX = length | |
| xor eax, eax ; Result accumulator | |
| xor ecx, ecx ; Counter | |
| ; The loop tests its condition at the bottom, so without this guard a | |
| ; zero length would still read (and compare) one byte from each buffer | |
| test rdx, rdx | |
| jz .finish | |
| .loop: | |
| mov r8b, [rdi + rcx] | |
| xor r8b, [rsi + rcx] ; Non-zero where the bytes differ | |
| or al, r8b ; Accumulate every difference seen | |
| inc rcx | |
| cmp rcx, rdx | |
| jb .loop | |
| .finish: | |
| ; Here AL = 0 if equal, non-zero if different; convert to 1 / 0 | |
| test al, al | |
| setz al | |
| movzx eax, al | |
| ret | |
| ; Secure memory clear | |
| ; Overwrites the buffer with three alternating bit patterns | |
| ; (0x55.., 0xAA.., 0x55..) and then with zeros. | |
| ; On return RDI = buffer + length and RAX = 0; RCX and RSI are preserved. | |
| ; NOTE(review): REP STOS relies on the direction flag being clear. | |
| secure_zero_memory: | |
| ; RDI = buffer, RSI = length | |
| push rcx | |
| ; Clear in multiple passes | |
| ; The pattern is loaded once, outside the loop, so the rotation at the | |
| ; bottom carries over into the next pass | |
| mov rax, 0x5555555555555555 | |
| mov rcx, 3 ; Number of passes | |
| .pass_loop: | |
| push rcx | |
| push rdi | |
| ; Overwrite with pattern | |
| mov rcx, rsi | |
| shr rcx, 3 ; Convert to qwords | |
| rep stosq | |
| ; Handle remaining bytes | |
| ; AL is the low byte of the pattern; every byte of it is identical | |
| mov rcx, rsi | |
| and rcx, 7 | |
| rep stosb | |
| pop rdi ; Rewind to the start of the buffer for the next pass | |
| pop rcx | |
| ; Different pattern each pass | |
| ror rax, 1 ; 0x5555... <-> 0xAAAA... | |
| loop .pass_loop | |
| ; Final zero pass | |
| xor eax, eax | |
| mov rcx, rsi | |
| shr rcx, 3 | |
| rep stosq | |
| mov rcx, rsi | |
| and rcx, 7 | |
| rep stosb | |
| ; Memory barrier | |
| mfence | |
| pop rcx | |
| ret | |
| ; Bounds checking | |
| ; Computes the address of element RSI of an array after checking the | |
| ; index against the bound and the arithmetic for overflow. | |
| ; Returns: RAX = base + index * element size, or -1 on any failure. | |
| ; Clobbers: RDX (MUL writes the high half of the product there) | |
| checked_array_access: | |
| ; RDI = array base, RSI = index, RDX = element size | |
| ; RCX = array bound | |
| ; Check bounds | |
| ; Unsigned compare, so a negative index is rejected as well | |
| cmp rsi, rcx | |
| jae .out_of_bounds | |
| ; Calculate address with overflow check | |
| mov rax, rsi | |
| mul rdx | |
| jo .overflow | |
| add rax, rdi | |
| jc .overflow | |
| ; Safe to access | |
| ret | |
| .out_of_bounds: | |
| .overflow: | |
| ; Return error | |
| mov rax, -1 | |
| ret</code></pre> | |
| <hr /> | |
| <p>This chapter covered hardware security features including NX, | |
| SMAP/SMEP, CET, and SGX, along with virtualization technologies like | |
| VT-x and EPT. We explored secure coding practices, speculation | |
| mitigations, and memory protection techniques essential for writing | |
| secure x86-64 code. The examples demonstrate practical implementations | |
| of these security features at the assembly level.</p> | |
| <hr /> | |
| <h2 id="chapter-15-performance-optimization-techniques">Chapter 15: | |
| Performance Optimization Techniques</h2> | |
| <h3 id="microarchitectural-optimization-fundamentals">15.1 | |
| Microarchitectural Optimization Fundamentals</h3> | |
| <h4 id="understanding-the-modern-x86-64-pipeline">Understanding the | |
| Modern x86-64 Pipeline</h4> | |
| <p>Modern x86-64 processors are superscalar, out-of-order execution | |
| machines with deep pipelines (typically 14-20 stages). Key components | |
| affecting performance:</p> | |
| <pre class="assembly"><code>; Pipeline stages example - Intel Skylake | |
| ; 1. Instruction Fetch (IF) | |
| ; 2. Instruction Length Decode (ILD) | |
| ; 3. Instruction Queue (IQ) | |
| ; 4. Instruction Decode (ID) - up to 4 instructions/cycle | |
| ; 5. Micro-op Queue | |
| ; 6. Register Rename/Allocate | |
| ; 7. Scheduler (RS - Reservation Station) | |
| ; 8. Execution Units (Multiple ports) | |
| ; 9. Writeback | |
| ; 10. Retire/Commit</code></pre> | |
| <h4 id="execution-ports-and-throughput">Execution Ports and | |
| Throughput</h4> | |
| <p>Intel processors have multiple execution ports (Skylake has 8 | |
| ports):</p> | |
| <pre class="assembly"><code>; Port utilization example | |
| ; Port 0: ALU, MUL, DIV, Branch | |
| ; Port 1: ALU, MUL, Fast LEA | |
| ; Port 2/3: Load | |
| ; Port 4: Store Data | |
| ; Port 5: ALU, Shuffle | |
| ; Port 6: ALU, Branch | |
| ; Port 7: Store Address | |
| ; Optimized code distributes across ports | |
| vmulps ymm0, ymm1, ymm2 ; Port 0/1 | |
| vaddps ymm3, ymm4, ymm5 ; Port 0/1 (FP add shares the FMA units on Skylake) | |
| vmovaps [rdi], ymm0 ; Port 2/3/7 (store address) + Port 4 (store data) | |
| ; The multiply and the add are independent and can issue together; | |
| ; the store has to wait for the multiply that produces ymm0</code></pre> | |
| <h3 id="branch-prediction-optimization">15.2 Branch Prediction | |
| Optimization</h3> | |
| <h4 id="static-branch-prediction">Static Branch Prediction</h4> | |
| <pre class="assembly"><code>; Compiler hints for likely/unlikely branches | |
| section .text | |
| ; Lay the code out so the likely case is the fall-through path: the | |
| ; conditional branch is then taken only in the unlikely case | |
| cmp rax, rbx | |
| jle .unlikely_path ; Forward branches predicted not-taken | |
| ; hot path code... (runs when rax > rbx) | |
| jmp .continue | |
| .unlikely_path: | |
| ; unlikely code... | |
| .continue: | |
| ; Using conditional moves to avoid branches | |
| cmp eax, ebx | |
| mov ecx, 0 ; Default value (MOV leaves the flags from CMP intact) | |
| mov edx, 1 ; Alternative value | |
| cmovg ecx, edx ; No branch, no misprediction penalty</code></pre> | |
| <h4 id="loop-optimization-and-unrolling">Loop Optimization and | |
| Unrolling</h4> | |
| <pre class="assembly"><code>; Original loop - 1 iteration per cycle best case | |
| ; Multiplies RCX doubles at [rsi] by xmm7 and stores them at [rdi]. | |
| ; The test is at the bottom of the loop, so RCX must be non-zero on entry. | |
| .loop: | |
| movsd xmm0, [rsi] | |
| mulsd xmm0, xmm7 | |
| movsd [rdi], xmm0 | |
| add rsi, 8 | |
| add rdi, 8 | |
| dec rcx | |
| jnz .loop | |
| ; Unrolled 4x with software pipelining | |
| ; RCX must be a non-zero multiple of 4: the loop only exits when RCX | |
| ; reaches exactly zero and there is no remainder handling. | |
| .loop_unrolled: | |
| ; Four independent loads | |
| movsd xmm0, [rsi] | |
| movsd xmm1, [rsi+8] | |
| movsd xmm2, [rsi+16] | |
| movsd xmm3, [rsi+24] | |
| ; Four independent multiplies (no dependency between them) | |
| mulsd xmm0, xmm7 | |
| mulsd xmm1, xmm7 | |
| mulsd xmm2, xmm7 | |
| mulsd xmm3, xmm7 | |
| movsd [rdi], xmm0 | |
| movsd [rdi+8], xmm1 | |
| movsd [rdi+16], xmm2 | |
| movsd [rdi+24], xmm3 | |
| add rsi, 32 | |
| add rdi, 32 | |
| sub rcx, 4 | |
| jnz .loop_unrolled</code></pre> | |
| <h3 id="memory-access-optimization">15.3 Memory Access Optimization</h3> | |
| <h4 id="cache-line-optimization">Cache Line Optimization</h4> | |
| <pre class="assembly"><code>; Cache line is 64 bytes on modern x86-64 | |
| ; Align data structures to cache line boundaries | |
| section .data | |
| align 64 | |
| matrix: times 1024 dq 0.0 ; 64-byte aligned | |
| section .text | |
| ; Prefetching for sequential access | |
| ; RSI = current position, RCX = number of 64-byte lines to process | |
| ; (non-zero). VMOVAPS requires RSI to be 32-byte aligned. | |
| .prefetch_loop: | |
| prefetcht0 [rsi + 256] ; Prefetch 4 cache lines ahead | |
| prefetcht0 [rsi + 320] | |
| ; Process current cache line | |
| vmovaps ymm0, [rsi] | |
| vmovaps ymm1, [rsi+32] | |
| ; ... processing ... | |
| add rsi, 64 | |
| dec rcx | |
| jnz .prefetch_loop</code></pre> | |
| <h4 id="non-temporal-stores-streaming-stores">Non-Temporal Stores | |
| (Streaming Stores)</h4> | |
| <pre class="assembly"><code>; Bypass cache for large streaming writes | |
| ; Reduces cache pollution | |
| ; RSI = source, RDI = destination, RCX = byte count. | |
| ; RCX must be a non-zero multiple of 128 (the loop exits only at exactly | |
| ; zero); the aligned moves require RSI and RDI to be 32-byte aligned. | |
| .streaming_copy: | |
| vmovaps ymm0, [rsi] | |
| vmovaps ymm1, [rsi+32] | |
| vmovaps ymm2, [rsi+64] | |
| vmovaps ymm3, [rsi+96] | |
| ; Non-temporal stores bypass cache | |
| vmovntps [rdi], ymm0 | |
| vmovntps [rdi+32], ymm1 | |
| vmovntps [rdi+64], ymm2 | |
| vmovntps [rdi+96], ymm3 | |
| add rsi, 128 | |
| add rdi, 128 | |
| sub rcx, 128 | |
| jnz .streaming_copy | |
| sfence ; Ensure stores complete before continuing</code></pre> | |
| <h3 id="simd-vectorization-techniques">15.4 SIMD Vectorization | |
| Techniques</h3> | |
| <h4 id="auto-vectorization-patterns">Auto-Vectorization Patterns</h4> | |
| <pre class="assembly"><code>; Scalar addition loop | |
| ; [rdi][i] = [rsi][i] + [rdx][i] for RCX single-precision floats. | |
| ; RCX must be non-zero. This is a fragment: there is no RET, so execution | |
| ; continues into whatever follows. | |
| scalar_add: | |
| movss xmm0, [rsi] | |
| addss xmm0, [rdx] | |
| movss [rdi], xmm0 | |
| add rsi, 4 | |
| add rdx, 4 | |
| add rdi, 4 | |
| dec rcx | |
| jnz scalar_add | |
| ; Vectorized with AVX2 | |
| ; Same operation, 8 floats per iteration. RCX (element count) must be a | |
| ; non-zero multiple of 8; the aligned moves require 32-byte aligned | |
| ; pointers. This is a fragment: there is no RET. | |
| vector_add_avx2: | |
| vmovaps ymm0, [rsi] ; Load 8 floats | |
| vaddps ymm0, ymm0, [rdx] ; Add 8 floats | |
| vmovaps [rdi], ymm0 ; Store 8 floats | |
| add rsi, 32 | |
| add rdx, 32 | |
| add rdi, 32 | |
| sub rcx, 8 | |
| jnz vector_add_avx2 | |
| ; AVX-512 version with masking for remainder | |
| ; RSI, RDX = sources, RDI = destination (64-byte aligned), RCX = number | |
| ; of floats (any value, including 0). Clobbers: RAX, RCX, k1, zmm0 | |
| vector_add_avx512: | |
| mov rax, rcx | |
| and rax, ~15 ; Process 16 elements at a time | |
| jz .remainder | |
| .main_loop: | |
| vmovaps zmm0, [rsi] ; Load 16 floats | |
| vaddps zmm0, zmm0, [rdx] | |
| vmovaps [rdi], zmm0 | |
| add rsi, 64 | |
| add rdx, 64 | |
| add rdi, 64 | |
| sub rax, 16 | |
| jnz .main_loop | |
| .remainder: | |
| and rcx, 15 ; Remainder count | |
| jz .done | |
| ; Build a mask with the low RCX bits set. RAX is free at this point, | |
| ; so callee-saved RBX is left alone, and a 16-lane mask only needs | |
| ; KMOVW (AVX-512F) rather than KMOVQ (AVX-512BW) | |
| mov eax, -1 | |
| bzhi eax, eax, ecx ; Create mask | |
| kmovw k1, eax | |
| ; Masked-off lanes are neither loaded nor stored, so nothing past the | |
| ; end of the arrays is touched | |
| vmovaps zmm0{k1}{z}, [rsi] | |
| vaddps zmm0{k1}, zmm0, [rdx] | |
| vmovaps [rdi]{k1}, zmm0 | |
| .done:</code></pre> | |
| <h4 id="fma-fused-multiply-add-optimization">FMA (Fused Multiply-Add) | |
| Optimization</h4> | |
| <pre class="assembly"><code>; SAXPY: Y = a*X + Y | |
| ; Scalar version | |
| ; One element: [rdi] = xmm7 * [rsi] + [rdi] (X at RSI, Y at RDI). | |
| ; The multiply and the add each round separately. | |
| saxpy_scalar: | |
| movss xmm0, [rsi] | |
| mulss xmm0, xmm7 ; xmm7 contains scalar 'a' | |
| addss xmm0, [rdi] | |
| movss [rdi], xmm0 | |
| ; FMA version - single instruction, better accuracy | |
| ; Eight elements: Y = a*X + Y, with X at RSI and Y at RDI (both 32-byte | |
| ; aligned) and 'a' in every lane of ymm7. | |
| saxpy_fma: | |
| vmovaps ymm0, [rdi] ; ymm0 = Y | |
| ; The 231 form multiplies the 2nd and 3rd operands and adds the 1st. | |
| ; The 213 form would compute ymm7 * ymm0 + [rsi], i.e. a*Y + X. | |
| vfmadd231ps ymm0, ymm7, [rsi] ; ymm0 = ymm7 * [rsi] + ymm0 | |
| vmovaps [rdi], ymm0</code></pre> | |
| <h3 id="instruction-level-parallelism">15.5 Instruction-Level | |
| Parallelism</h3> | |
| <h4 id="dependency-chain-breaking">Dependency Chain Breaking</h4> | |
| <pre class="assembly"><code>; Poor: Long dependency chain | |
| ; Every ADD below waits for the previous one: four dependent additions | |
| mov rax, [rsi] | |
| add rax, 1 | |
| add rax, [rsi+8] | |
| add rax, [rsi+16] | |
| add rax, [rsi+24] | |
| ; Better: Multiple accumulator chains | |
| ; Same result in RAX. The four loads are independent, and the RCX+RDX | |
| ; sum does not depend on the RAX chain. | |
| ; Note: this version also overwrites RBX, RCX and RDX. | |
| mov rax, [rsi] | |
| mov rbx, [rsi+8] | |
| mov rcx, [rsi+16] | |
| mov rdx, [rsi+24] | |
| add rax, 1 | |
| add rax, rbx | |
| add rcx, rdx | |
| add rax, rcx ; Shorter critical path</code></pre> | |
| <h4 id="software-pipelining">Software Pipelining</h4> | |
| <pre class="assembly"><code>; Matrix multiplication with software pipelining | |
| ; Overlap loads with computation | |
| ; Illustrates loading the next iteration's operands while the current | |
| ; iteration's FMAs execute. RCX = iteration count (non-zero). | |
| ; NOTE(review): the accumulators zmm16-zmm18 are not initialised or stored | |
| ; in this fragment, and there is no RET. | |
| ; NOTE(review): every iteration, including the last, loads [rsi + 64] and | |
| ; [rdx + 64], so one block past the final operand is read. | |
| matrix_multiply_optimized: | |
| ; Preload first iteration | |
| vmovaps zmm0, [rsi] | |
| vmovaps zmm1, [rdx] | |
| .loop: | |
| ; Current iteration computation | |
| vfmadd231ps zmm16, zmm0, zmm1 | |
| ; Load next iteration's operands while computing | |
| vmovaps zmm2, [rsi + 64] | |
| vmovaps zmm3, [rdx + 64] | |
| vfmadd231ps zmm17, zmm0, zmm3 | |
| vfmadd231ps zmm18, zmm2, zmm1 | |
| ; Move to next iteration | |
| vmovaps zmm0, zmm2 | |
| vmovaps zmm1, zmm3 | |
| add rsi, 64 | |
| add rdx, 64 | |
| dec rcx | |
| jnz .loop</code></pre> | |
| <h3 id="code-size-and-alignment-optimization">15.6 Code Size and | |
| Alignment Optimization</h3> | |
| <h4 id="function-and-loop-alignment">Function and Loop Alignment</h4> | |
| <pre class="assembly"><code>; Align functions to 16-byte boundaries | |
| align 16 | |
| optimized_function: | |
| ; Function code... | |
| ; Align hot loops to 32-byte boundaries for fetch | |
| align 32 | |
| .hot_loop: | |
| ; Critical loop body | |
| ; Keep under 32 bytes if possible for μop cache</code></pre> | |
| <h4 id="instruction-selection-for-size">Instruction Selection for | |
| Size</h4> | |
| <pre class="assembly"><code>; Prefer shorter encodings | |
| xor eax, eax ; 2 bytes - better than mov eax, 0 (5 bytes) | |
| inc rsi ; 3 bytes - leaves CF unchanged (partial-flag update can cost a merge on older CPUs) | |
| add rsi, 1 ; 4 bytes - writes all flags; both forms macro-fuse with jnz on Sandy Bridge and later | |
| ; Use VEX encoding when beneficial | |
| vxorps xmm0, xmm0, xmm0 ; Clears upper bits, avoids transition penalty</code></pre> | |
| <h3 id="profile-guided-optimization">15.7 Profile-Guided | |
| Optimization</h3> | |
| <h4 id="using-performance-counters">Using Performance Counters</h4> | |
| <pre class="assembly"><code>; Example: Measuring cache misses | |
| ; Use rdpmc instruction to read performance counters | |
| read_perf_counter: | |
| mov ecx, 0 ; Counter index | |
| rdpmc ; Read counter into EDX:EAX | |
| shl rdx, 32 | |
| or rax, rdx ; Full 64-bit count in RAX | |
| ret | |
| ; Instrument critical section | |
| call read_perf_counter | |
| mov [start_count], rax | |
| ; Critical code section | |
| call optimized_function | |
| call read_perf_counter | |
| sub rax, [start_count] | |
| ; RAX now contains event count</code></pre> | |
| <h3 id="practical-optimization-example">15.8 Practical Optimization | |
| Example</h3> | |
| <p>Here’s a complete optimized memory copy function using various | |
| techniques:</p> | |
| <pre class="assembly"><code>; High-performance memory copy | |
| ; RDI = destination, RSI = source, RDX = size in bytes | |
| global fast_memcpy | |
| fast_memcpy: | |
| cmp rdx, 64 | |
| jb .small_copy | |
| cmp rdx, 2048 | |
| jb .medium_copy | |
| .large_copy: | |
| ; Use non-temporal stores for large copies (vmovdqa/vmovntdq fault unless RSI and RDI are 32-byte aligned) | |
| mov rcx, rdx | |
| shr rcx, 6 ; Divide by 64 | |
| .large_loop: | |
| prefetchnta [rsi + 256] | |
| vmovdqa ymm0, [rsi] | |
| vmovdqa ymm1, [rsi + 32] | |
| vmovntdq [rdi], ymm0 | |
| vmovntdq [rdi + 32], ymm1 | |
| add rsi, 64 | |
| add rdi, 64 | |
| dec rcx | |
| jnz .large_loop | |
| sfence | |
| and rdx, 63 ; Handle remainder | |
| jz .done | |
| .medium_copy: | |
| ; Use regular moves for medium sizes | |
| mov rcx, rdx | |
| shr rcx, 5 ; Divide by 32 | |
| jz .small_copy | |
| .medium_loop: | |
| vmovdqu ymm0, [rsi] | |
| vmovdqu [rdi], ymm0 | |
| add rsi, 32 | |
| add rdi, 32 | |
| dec rcx | |
| jnz .medium_loop | |
| and rdx, 31 | |
| .small_copy: | |
| ; Handle remainder with scalar moves | |
| mov rcx, rdx | |
| rep movsb | |
| .done: | |
| vzeroupper ; Clear upper YMM bits | |
| ret</code></pre> | |
| <h3 id="performance-analysis-tools">15.9 Performance Analysis Tools</h3> | |
| <h4 id="intel-vtune-profiler-integration">Intel VTune Profiler | |
| Integration</h4> | |
| <pre class="assembly"><code>; Markers for VTune analysis | |
| %include "ittnotify.h" | |
| section .data | |
| domain_name db "MyApp", 0 | |
| task_name db "CriticalLoop", 0 | |
| section .text | |
| ; Create domain | |
| lea rdi, [domain_name] | |
| call __itt_domain_create | |
| mov [domain_handle], rax | |
| ; Start task | |
| lea rdi, [task_name] | |
| mov rsi, [domain_handle] | |
| call __itt_task_begin | |
| ; Critical code here | |
| call optimized_function | |
| ; End task | |
| mov rdi, [domain_handle] | |
| call __itt_task_end</code></pre> | |
| <h3 id="summary">Summary</h3> | |
| <p>Performance optimization in x86-64 assembly requires | |
| understanding:</p> | |
| <ol type="1"> | |
| <li><p><strong>Microarchitecture</strong>: Pipeline stages, execution | |
| ports, and μop cache</p></li> | |
| <li><p><strong>Memory Hierarchy</strong>: Cache lines, prefetching, and | |
| bandwidth limitations</p></li> | |
| <li><p><strong>Instruction Selection</strong>: Choosing optimal | |
| instructions for specific scenarios</p></li> | |
| <li><p><strong>Parallelism</strong>: Both instruction-level and | |
| data-level (SIMD)</p></li> | |
| <li><p><strong>Measurement</strong>: Using performance counters and | |
| profiling tools</p></li> | |
| </ol> | |
| <p>Key principles:</p> | |
| <ul> | |
| <li><p>Minimize branch mispredictions</p></li> | |
| <li><p>Maximize instruction-level parallelism</p></li> | |
| <li><p>Optimize memory access patterns</p></li> | |
| <li><p>Use SIMD effectively</p></li> | |
| <li><p>Profile and measure actual performance</p></li> | |
| </ul> | |
| <p>Modern compilers handle many optimizations automatically, but | |
| critical hot paths often benefit from hand-tuned assembly, especially | |
| when using advanced features like AVX-512 or specialized | |
| instructions.</p> | |
| <h3 id="exercises">Exercises</h3> | |
| <ol type="1"> | |
| <li><p>Implement an optimized matrix transpose using AVX-512</p></li> | |
| <li><p>Write a branch-free binary search using conditional | |
| moves</p></li> | |
| <li><p>Create a high-performance string comparison function using | |
| SIMD</p></li> | |
| <li><p>Optimize a hash table lookup to minimize cache misses</p></li> | |
| <li><p>Profile and optimize a real-world function using performance | |
| counters</p></li> | |
| </ol> | |
| <p>Next, we’ll explore Chapter 16: Code Generation and Compiler Backend, | |
| covering instruction selection, register allocation, JIT compilation, | |
| and dynamic code generation.</p> | |
| <hr /> | |
| <h2 id="chapter-16-code-generation-and-compiler-backend">Chapter 16: | |
| Code Generation and Compiler Backend</h2> | |
| <p>Building on our performance optimization techniques, this chapter | |
| explores how compilers generate x86-64 machine code and how to implement | |
| compiler backends, JIT compilation, and dynamic code generation.</p> | |
| <h3 id="compiler-architecture-overview">16.1 Compiler Architecture | |
| Overview</h3> | |
| <h4 id="compilation-pipeline">Compilation Pipeline</h4> | |
| <div class="sourceCode" id="cb210"><pre | |
| class="sourceCode c"><code class="sourceCode c"><span id="cb210-1"><a href="#cb210-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Typical compiler phases</span></span> | |
| <span id="cb210-2"><a href="#cb210-2" aria-hidden="true" tabindex="-1"></a><span class="co">// Source Code → Lexer → Parser → AST → IR → Optimization → Code Gen → Assembly</span></span></code></pre></div> | |
| <p>The backend focuses on the final stages: IR (Intermediate | |
| Representation) to assembly code generation.</p> | |
| <pre class="assembly"><code>; Example: Simple expression tree to x86-64 | |
| ; Expression: (a + b) * (c - d) | |
| ; Assuming a=RDI, b=RSI, c=RDX, d=RCX | |
| code_gen_expression: | |
| ; Generate code for left subtree (a + b) | |
| mov rax, rdi ; Load a | |
| add rax, rsi ; Add b | |
| ; Generate code for right subtree (c - d) | |
| mov r10, rdx ; Load c | |
| sub r10, rcx ; Subtract d | |
| ; Combine results | |
| imul rax, r10 ; Multiply results | |
| ret</code></pre> | |
| <h3 id="register-allocation">16.2 Register Allocation</h3> | |
| <h4 id="graph-coloring-algorithm">Graph Coloring Algorithm</h4> | |
| <pre class="assembly"><code>; Register allocation example | |
| ; Variables: v1, v2, v3, v4, v5 | |
| ; Live ranges determine interference graph | |
| section .text | |
| ; Before register allocation (pseudo-code) | |
| ; LOAD v1, [mem1] | |
| ; LOAD v2, [mem2] | |
| ; ADD v3, v1, v2 | |
| ; LOAD v4, [mem3] | |
| ; MUL v5, v3, v4 | |
| ; STORE [result], v5 | |
| ; After register allocation | |
| register_allocated: | |
| mov rax, [mem1] ; v1 → RAX | |
| mov rbx, [mem2] ; v2 → RBX | |
| add rax, rbx ; v3 → RAX (reuse v1's register) | |
| mov rcx, [mem3] ; v4 → RCX | |
| imul rax, rcx ; v5 → RAX (reuse v3's register) | |
| mov [result], rax | |
| ret</code></pre> | |
| <h4 id="spill-code-generation">Spill Code Generation</h4> | |
| <pre class="assembly"><code>; When registers are exhausted, spill to stack | |
| spill_example: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 32 ; Allocate spill slots | |
| ; Too many live variables for available registers | |
| mov rax, [input1] | |
| mov rbx, [input2] | |
| mov rcx, [input3] | |
| mov rdx, [input4] | |
| mov rsi, [input5] | |
| mov rdi, [input6] | |
| mov r8, [input7] | |
| mov r9, [input8] | |
| ; Need more variables - spill | |
| mov [rbp-8], rax ; Spill v1 | |
| mov rax, [input9] ; Load v9 | |
| ; ... use rax ... | |
| mov [rbp-16], rax ; Spill v9 | |
| mov rax, [rbp-8] ; Reload v1 | |
| mov rsp, rbp | |
| pop rbp | |
| ret</code></pre> | |
| <h3 id="instruction-selection">16.3 Instruction Selection</h3> | |
| <h4 id="pattern-matching-and-tiling">Pattern Matching and Tiling</h4> | |
| <pre class="assembly"><code>; Instruction selector patterns | |
| ; Pattern: memory_operand + register → single instruction | |
| ; Naive code generation | |
| naive_add: | |
| mov rax, [mem_addr] | |
| add rax, rbx | |
| mov [mem_addr], rax | |
| ; Optimized selection using memory operands | |
| optimized_add: | |
| add [mem_addr], rbx ; Single instruction | |
| ; Complex addressing mode selection | |
| ; Array access: arr[i*8 + j] | |
| array_access: | |
| ; Could generate: | |
| mov rax, rsi ; i | |
| shl rax, 3 ; i*8 | |
| add rax, rdx ; i*8 + j | |
| mov rcx, [rdi + rax]; arr[i*8 + j] | |
| ; Better selection: | |
| mov rcx, [rdi + rsi*8 + rdx] ; Single instruction</code></pre> | |
| <h4 id="peephole-optimization">Peephole Optimization</h4> | |
| <pre class="assembly"><code>; Common peephole patterns | |
| section .text | |
| ; Pattern: Push followed by pop | |
| ; Before: | |
| push rax | |
| pop rbx | |
| ; After: | |
| mov rbx, rax | |
| ; Pattern: Redundant moves | |
| ; Before: | |
| mov rax, rbx | |
| mov rbx, rax | |
| ; After: | |
| mov rax, rbx | |
| ; Pattern: Constant folding | |
| ; Before: | |
| mov rax, 5 | |
| add rax, 3 | |
| ; After: | |
| mov rax, 8 | |
| ; Pattern: Strength reduction | |
| ; Before: | |
| imul rax, 2 | |
| ; After: | |
| add rax, rax ; Or: shl rax, 1</code></pre> | |
| <h3 id="jit-compilation-implementation">16.4 JIT Compilation | |
| Implementation</h3> | |
| <h4 id="basic-jit-compiler-structure">Basic JIT Compiler Structure</h4> | |
| <div class="sourceCode" id="cb216"><pre | |
| class="sourceCode c"><code class="sourceCode c"><span id="cb216-1"><a href="#cb216-1" aria-hidden="true" tabindex="-1"></a><span class="co">// C structure for JIT compiler</span></span> | |
| <span id="cb216-2"><a href="#cb216-2" aria-hidden="true" tabindex="-1"></a><span class="kw">typedef</span> <span class="kw">struct</span> <span class="op">{</span></span> | |
| <span id="cb216-3"><a href="#cb216-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">uint8_t</span><span class="op">*</span> code_buffer<span class="op">;</span></span> | |
| <span id="cb216-4"><a href="#cb216-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">size_t</span> buffer_size<span class="op">;</span></span> | |
| <span id="cb216-5"><a href="#cb216-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">size_t</span> current_pos<span class="op">;</span></span> | |
| <span id="cb216-6"><a href="#cb216-6" aria-hidden="true" tabindex="-1"></a><span class="op">}</span> JitCompiler<span class="op">;</span></span></code></pre></div> | |
| <pre class="assembly"><code>; Runtime code generation example | |
| ; Generate function dynamically that adds constant | |
| section .data | |
| code_buffer: resb 4096 | |
| section .text | |
| generate_adder: | |
| ; Input: RDI = constant to add | |
| ; Output: RAX = pointer to generated function | |
| lea rax, [code_buffer] | |
| ; Generate: mov rax, rdi | |
| mov byte [rax], 0x48 | |
| mov byte [rax+1], 0x89 | |
| mov byte [rax+2], 0xF8 | |
| ; Generate: add rax, CONSTANT | |
| mov byte [rax+3], 0x48 | |
| mov byte [rax+4], 0x05 | |
| mov dword [rax+5], edi ; Embed constant | |
| ; Generate: ret | |
| mov byte [rax+9], 0xC3 | |
| ; Make code executable | |
| mov rdi, rax | |
| mov rsi, 4096 | |
| mov rdx, 7 ; PROT_READ | PROT_WRITE | PROT_EXEC | |
| call mprotect | |
| lea rax, [code_buffer] | |
| ret</code></pre> | |
| <h4 id="advanced-jit-with-templates">Advanced JIT with Templates</h4> | |
| <pre class="assembly"><code>; Template-based code generation | |
| section .data | |
| ; Template for comparison function | |
| template_start: | |
| cmp rdi, 0x12345678 ; Placeholder for constant | |
| jg .greater | |
| jl .less | |
| xor eax, eax ; Equal | |
| ret | |
| .greater: | |
| mov eax, 1 | |
| ret | |
| .less: | |
| mov eax, -1 | |
| ret | |
| template_end: | |
| section .text | |
| generate_comparator: | |
| ; Input: RDI = constant to compare against | |
| ; Copy template | |
| mov rsi, template_start | |
| mov rcx, template_end - template_start | |
| lea rdx, [code_buffer] | |
| .copy_loop: | |
| mov al, [rsi] | |
| mov [rdx], al | |
| inc rsi | |
| inc rdx | |
| dec rcx | |
| jnz .copy_loop | |
| ; Patch constant | |
| lea rax, [code_buffer + 3] ; Skip REX.W, opcode 0x81 and ModR/M to reach the imm32 | |
| mov [rax], edi | |
| ; Return pointer to generated code | |
| lea rax, [code_buffer] | |
| ret</code></pre> | |
| <h3 id="dynamic-binary-translation">16.5 Dynamic Binary Translation</h3> | |
| <h4 id="self-modifying-code">Self-Modifying Code</h4> | |
| <pre class="assembly"><code>; Self-modifying code example | |
| ; Dynamically optimize based on runtime behavior | |
| section .data | |
| branch_counter: dq 0 | |
| threshold: dq 1000 | |
| section .text | |
| adaptive_branch: | |
| ; Increment counter | |
| inc qword [branch_counter] | |
| ; Check if we should optimize | |
| mov rax, [branch_counter] | |
| cmp rax, [threshold] | |
| jl .normal_path | |
| ; Rewrite this code based on statistics | |
| call optimize_hot_path | |
| .normal_path: | |
| ; Original code | |
| test rdi, rdi | |
| jz .zero_case | |
| jmp .nonzero_case | |
| .zero_case: | |
| ; Handle zero | |
| ret | |
| .nonzero_case: | |
| ; Handle non-zero | |
| ret | |
| optimize_hot_path: | |
| ; Analyze branch statistics and rewrite | |
| ; the jump instruction to use likely path | |
| push rbp | |
| mov rbp, rsp | |
| ; Change page permissions for writing | |
| lea rdi, [adaptive_branch] | |
| mov rsi, 4096 | |
| mov rdx, 7 ; PROT_READ | PROT_WRITE | PROT_EXEC | |
| call mprotect | |
| ; Rewrite branch (example: change jz to jnz) | |
| lea rax, [adaptive_branch.normal_path] | |
| mov byte [rax+3], 0x75 ; JZ rel8 opcode (0x74) follows the 3-byte test; overwrite it with JNZ | |
| mov rsp, rbp | |
| pop rbp | |
| ret</code></pre> | |
| <h3 id="machine-code-encoding">16.6 Machine Code Encoding</h3> | |
| <h4 id="x86-64-instruction-encoding">x86-64 Instruction Encoding</h4> | |
| <pre class="assembly"><code>; Understanding x86-64 encoding | |
| ; Format: [Prefixes] [REX] [Opcode] [ModR/M] [SIB] [Displacement] [Immediate] | |
| section .text | |
| encode_instruction: | |
| ; Example: Encode "add rax, rbx" manually | |
| ; REX prefix: 0x48 (W=1 for 64-bit) | |
| ; Opcode: 0x01 (ADD r/m64, r64) | |
| ; ModR/M: 0xD8 (mod=11, reg=011, r/m=000) | |
| db 0x48, 0x01, 0xD8 | |
| ; Example: Encode "mov r13, [r14 + rax*8 + 0x100]" | |
| ; REX: 0x4D (W=1, R=1, B=1) | |
| ; Opcode: 0x8B (MOV r64, r/m64) | |
| ; ModR/M: 0xAC (mod=10, reg=101, r/m=100) | |
| ; SIB: 0xC6 (scale=11, index=000, base=110) | |
| ; Disp32: 0x00000100 | |
| db 0x4D, 0x8B, 0xAC, 0xC6 | |
| dd 0x100</code></pre> | |
| <h4 id="building-an-assembler">Building an Assembler</h4> | |
| <pre class="assembly"><code>; Simple assembler implementation | |
| section .data | |
| mnemonic_table: | |
| db "ADD", 0, 0x01 | |
| db "SUB", 0, 0x29 | |
| db "MOV", 0, 0x89 | |
| db "CMP", 0, 0x39 | |
| section .text | |
| assemble_instruction: | |
| ; Input: RSI = instruction string | |
| ; Output: RDI = encoded bytes | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 32 | |
| ; Parse mnemonic | |
| call parse_mnemonic | |
| mov [rbp-8], rax ; Store opcode | |
| ; Parse operands | |
| call parse_operands | |
| mov [rbp-16], rax ; Store ModR/M byte | |
| ; Generate REX prefix if needed | |
| call generate_rex | |
| mov [rbp-24], rax | |
| ; Emit instruction | |
| mov rdi, [output_buffer] | |
| ; Write REX if present | |
| cmp byte [rbp-24], 0 | |
| je .no_rex | |
| mov al, [rbp-24] | |
| stosb | |
| .no_rex: | |
| ; Write opcode | |
| mov al, [rbp-8] | |
| stosb | |
| ; Write ModR/M | |
| mov al, [rbp-16] | |
| stosb | |
| mov rsp, rbp | |
| pop rbp | |
| ret</code></pre> | |
| <h3 id="optimization-pass-implementation">16.7 Optimization Pass | |
| Implementation</h3> | |
| <h4 id="dead-code-elimination">Dead Code Elimination</h4> | |
| <pre class="assembly"><code>; Dead code elimination pass | |
| ; Analyze and remove unreachable code | |
| dead_code_elimination: | |
| push rbp | |
| mov rbp, rsp | |
| ; Build control flow graph | |
| call build_cfg | |
| ; Mark reachable blocks | |
| lea rdi, [entry_block] | |
| call mark_reachable | |
| ; Remove unmarked blocks | |
| lea rsi, [block_list] | |
| .remove_loop: | |
| mov rax, [rsi] | |
| test rax, rax | |
| jz .done | |
| ; Check if marked | |
| test byte [rax + block.flags], REACHABLE | |
| jnz .keep | |
| ; Remove block | |
| call remove_block | |
| .keep: | |
| add rsi, 8 | |
| jmp .remove_loop | |
| .done: | |
| mov rsp, rbp | |
| pop rbp | |
| ret</code></pre> | |
| <h4 id="constant-propagation">Constant Propagation</h4> | |
| <pre class="assembly"><code>; Constant propagation implementation | |
| constant_propagation: | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, 256 ; Space for constant table | |
| ; Initialize constant table | |
| lea rdi, [rbp-256] | |
| mov rcx, 32 | |
| xor eax, eax | |
| rep stosq | |
| ; Scan instructions | |
| lea rsi, [instruction_list] | |
| .scan_loop: | |
| mov rax, [rsi] | |
| test rax, rax | |
| jz .done | |
| ; Check if MOV immediate | |
| cmp byte [rax], 0xB8 ; MOV reg, imm | |
| jb .not_const | |
| cmp byte [rax], 0xBF | |
| ja .not_const | |
| ; Record constant | |
| movzx rcx, byte [rax] | |
| and rcx, 7 ; Extract register | |
| mov edx, [rax+1] ; Get imm32 (zero-extended into RDX); B8+r without REX.W has a 4-byte immediate | |
| mov [rbp-256+rcx*8], rdx | |
| .not_const: | |
| ; Check for uses and replace | |
| call replace_with_constants | |
| add rsi, 8 | |
| jmp .scan_loop | |
| .done: | |
| mov rsp, rbp | |
| pop rbp | |
| ret</code></pre> | |
| <h3 id="llvm-integration">16.8 LLVM Integration</h3> | |
| <h4 id="llvm-ir-to-x86-64">LLVM IR to x86-64</h4> | |
| <pre class="assembly"><code>; Example LLVM IR to x86-64 lowering | |
| ; LLVM IR: %result = add i64 %a, %b | |
| llvm_lower_add: | |
| ; Input: RDI = LLVM instruction pointer | |
| ; Output: Machine code in buffer | |
| ; Extract operands from LLVM instruction | |
| mov rsi, [rdi + llvm_inst.op1] | |
| mov rdx, [rdi + llvm_inst.op2] | |
| ; Map virtual registers to physical | |
| call get_physical_reg | |
| mov r8, rax ; Physical reg for op1 | |
| mov rdi, rdx | |
| call get_physical_reg | |
| mov r9, rax ; Physical reg for op2 | |
| ; Generate x86-64 add instruction | |
| call emit_add_instruction | |
| ret | |
| emit_add_instruction: | |
| ; Emit: add r8, r9 | |
| ; Calculate REX prefix | |
| mov al, 0x48 ; REX.W | |
| test r8, 8 | |
| jz .no_rex_r | |
| or al, 0x04 ; REX.R | |
| .no_rex_r: | |
| test r9, 8 | |
| jz .no_rex_b | |
| or al, 0x01 ; REX.B | |
| .no_rex_b: | |
| ; Emit REX prefix | |
| mov rdi, [code_ptr] | |
| stosb | |
| ; Emit opcode | |
| mov al, 0x01 | |
| stosb | |
| ; Emit ModR/M byte | |
| mov al, 0xC0 ; mod=11 (register) | |
| and r9b, 7 | |
| or al, r9b ; r/m field | |
| and r8b, 7 | |
| shl r8b, 3 | |
| or al, r8b ; reg field | |
| stosb | |
| ret</code></pre> | |
| <hr /> | |
| <h2 id="chapter-16-code-generation-and-compiler-backend-1">Chapter 16 — | |
| Code Generation and Compiler Backend</h2> | |
| <p>This chapter moves from hand‑tuned optimization into the compiler’s | |
| domain: how to turn intermediate representation (IR) into machine code, | |
| how register allocation and instruction selection work, and how modern | |
| backends, JIT compilers, and binary translators produce efficient | |
| x86‑64 programs.</p> | |
| <hr /> | |
| <h3 id="compilation-pipeline-and-backend-scope">16.1 Compilation | |
| Pipeline and Backend Scope</h3> | |
| <p>A typical compiler architecture has:</p> | |
| <pre class="text"><code>Source → Lexical Analysis → Parsing → AST → IR → Optimization → Code Generation → Assembler → Linker</code></pre> | |
| <p><strong>Backend responsibilities:</strong></p> | |
| <ul> | |
| <li><p>Translate an optimized IR into target instructions</p></li> | |
| <li><p>Select efficient opcodes and addressing modes</p></li> | |
| <li><p>Allocate hardware registers and handle spills</p></li> | |
| <li><p>Emit correct encodings</p></li> | |
| <li><p>Optionally perform late‑stage optimizations (peephole, | |
| instruction scheduling)</p></li> | |
| </ul> | |
| <p>Example — IR lowering for <code>(a + b) * (c - d)</code>:</p> | |
| <pre class="assembly"><code>; Assuming: | |
| ; a → RDI, b → RSI, c → RDX, d → RCX | |
| mov rax, rdi ; a | |
| add rax, rsi ; a + b | |
| mov r10, rdx ; c | |
| sub r10, rcx ; c - d | |
| imul rax, r10 ; (a+b) * (c-d) | |
| ret</code></pre> | |
| <hr /> | |
| <h3 id="register-allocation-1">16.2 Register Allocation</h3> | |
| <h4 id="graph-coloring-allocation">Graph Coloring Allocation</h4> | |
| <p>Modern backends build <strong>interference graphs</strong> of | |
| variables whose live ranges overlap, then color them using available | |
| physical registers.</p> | |
| <p>Before allocation:</p> | |
| <pre class="text"><code>v1 = load mem1 | |
| v2 = load mem2 | |
| v3 = v1 + v2 | |
| v4 = load mem3 | |
| v5 = v3 * v4 | |
| store result, v5</code></pre> | |
| <p>After allocation:</p> | |
| <pre class="assembly"><code>mov rax, [mem1] ; v1 | |
| mov rbx, [mem2] ; v2 | |
| add rax, rbx ; v3 in RAX | |
| mov rcx, [mem3] ; v4 | |
| imul rax, rcx ; v5 in RAX | |
| mov [result], rax | |
| ret</code></pre> | |
| <p>When registers run out, <em>spill code</em> saves values to the | |
| stack:</p> | |
| <pre class="assembly"><code>sub rsp, 32 | |
| mov [rsp], rax ; spill value | |
| ; ... other work that reuses RAX ... | |
| mov rax, [rsp] ; reload later | |
| add rsp, 32</code></pre> | |
| <hr /> | |
| <h3 id="instruction-selection-1">16.3 Instruction Selection</h3> | |
| <h4 id="matching-and-tiling">Matching and Tiling</h4> | |
| <p>Backends prefer single instructions over multi‑step sequences if the | |
| ISA supports it.</p> | |
| <p>Naive:</p> | |
| <pre class="assembly"><code>mov rax, [mem] | |
| add rax, rbx | |
| mov [mem], rax</code></pre> | |
| <p>Selected pattern:</p> | |
| <pre class="assembly"><code>add [mem], rbx ; fewer instructions, less register pressure</code></pre> | |
| <p>Intel’s complex addressing modes allow direct array indexing:</p> | |
| <pre class="assembly"><code>mov rcx, [rdi + rsi*8 + rdx] ; arr[i*8 + j]</code></pre> | |
| <hr /> | |
| <h3 id="late-stage-peephole-optimization">16.4 Late-stage (Peephole) | |
| Optimization</h3> | |
| <p>Peephole passes remove redundancy:</p> | |
| <pre class="assembly"><code>; push + pop with different regs | |
| push rax | |
| pop rbx | |
| ; → mov rbx, rax | |
| ; Constant folding | |
| mov rax, 5 | |
| add rax, 3 | |
| ; → mov rax, 8</code></pre> | |
| <p>Strength reduction:</p> | |
| <pre class="assembly"><code>imul rax, 2 ; → add rax, rax or shl rax, 1</code></pre> | |
| <hr /> | |
| <h3 id="jit-compilation">16.5 JIT Compilation</h3> | |
| <p>JIT compilers emit binary directly into memory at runtime and mark it | |
| executable.</p> | |
| <p>Simple constant adder generator:</p> | |
| <pre class="assembly"><code>; Generates: rax = rax + CONST; ret | |
| mov byte [buf+0], 0x48 | |
| mov byte [buf+1], 0x05 | |
| mov dword [buf+2], edi ; constant | |
| mov byte [buf+6], 0xC3 | |
| ; mprotect(buf, size, PROT_EXEC) before jumping</code></pre> | |
| <p>Templates allow patching constants:</p> | |
| <pre class="assembly"><code>cmp rdi, 0x12345678 ; imm32 placeholder (sign-extended), patched at runtime | |
| jg greater | |
| jl less | |
| xor eax, eax | |
| ret</code></pre> | |
| <hr /> | |
| <h3 id="dynamic-binary-translation-self-modifying-code">16.6 Dynamic | |
| Binary Translation & Self-modifying Code</h3> | |
| <p>Runtime adaptation for hot paths:</p> | |
| <pre class="assembly"><code>inc qword [branch_counter] | |
| cmp qword [branch_counter], 1000 | |
| jl normal_path | |
| ; Change branch to likely path | |
| ; call mprotect to make the code page writable | |
| mov byte [branch+offset], 0x75 ; JNZ</code></pre> | |
| <hr /> | |
| <h3 id="machine-code-encoding-1">16.7 Machine Code Encoding</h3> | |
| <h4 id="encoding-format">Encoding format:</h4> | |
| <p>[Prefixes] [REX] [Opcode] [ModR/M] [SIB] [Displacement] | |
| [Immediate]</p> | |
| <p>Encoding <code>add rax, rbx</code>:</p> | |
| <pre class="assembly"><code>; REX.W Opcode ModR/M | |
| db 0x48, 0x01, 0xD8</code></pre> | |
| <hr /> | |
| <h3 id="building-an-assembler-backend-emitter">16.8 Building an | |
| Assembler (Backend-emitter)</h3> | |
| <p>Minimal assembler loop:</p> | |
| <pre class="assembly"><code>; parse mnemonic → get opcode | |
| ; parse operands → make ModR/M | |
| ; emit optional REX byte | |
| ; emit opcode + ModR/M + displacement/immediate</code></pre> | |
| <hr /> | |
| <h3 id="backend-optimization-passes">16.9 Backend Optimization | |
| Passes</h3> | |
| <p>Late passes may include:</p> | |
| <ul> | |
| <li><p><strong>Dead code elimination</strong>: remove unreachable basic | |
| blocks</p></li> | |
| <li><p><strong>Constant propagation</strong>: replace runtime loads with | |
| compile‑time constants, folding dependent operations</p></li> | |
| <li><p><strong>Instruction scheduling</strong>: rearrange independent | |
| instructions to avoid stalls</p></li> | |
| </ul> | |
| <hr /> | |
| <h3 id="llvm-backend-integration">16.10 LLVM Backend Integration</h3> | |
| <p>LLVM IR:</p> | |
| <div class="sourceCode" id="cb238"><pre | |
| class="sourceCode llvm"><code class="sourceCode llvm"><span id="cb238-1"><a href="#cb238-1" aria-hidden="true" tabindex="-1"></a><span class="fu">%res</span> = <span class="kw">add</span> <span class="dt">i64</span> <span class="fu">%a</span>, <span class="fu">%b</span></span></code></pre></div> | |
| <p>Lower to:</p> | |
| <pre class="assembly"><code>add r8, r9 ; after reg assignment</code></pre> | |
| <p>With correct REX bits for extended registers.</p> | |
| <hr /> | |
| <h3 id="summary-1">Summary</h3> | |
| <p>A compiler backend for x86‑64:</p> | |
| <ol type="1"> | |
| <li><p>Maps IR operations to machine instructions</p></li> | |
| <li><p>Allocates registers efficiently, spilling only when | |
| necessary</p></li> | |
| <li><p>Selects optimal encodings and complex addressing modes</p></li> | |
| <li><p>Performs late optimizations to reduce size and improve | |
| performance</p></li> | |
| <li><p>Emits correct binary sequences</p></li> | |
| <li><p>Optionally supports runtime generation and optimization | |
| (JIT/DBT)</p></li> | |
| </ol> | |
| <p>Understanding these mechanics allows developers to:</p> | |
| <ul> | |
| <li><p>Integrate with LLVM or GCC backends</p></li> | |
| <li><p>Write domain-specific JIT engines</p></li> | |
| <li><p>Hand-optimize IR for extreme performance</p></li> | |
| </ul> | |
| <hr /> | |
| <h3 id="exercises-1">Exercises</h3> | |
| <ol type="1"> | |
| <li><p>Implement a peephole optimizer that folds constants and removes | |
| redundant moves.</p></li> | |
| <li><p>Write a register allocator using a graph coloring algorithm for a | |
| small IR.</p></li> | |
| <li><p>Implement a JIT generator for a branchless min/max function in | |
| x86‑64.</p></li> | |
| <li><p>Produce a machine code encoder for <code>mov rax, imm64</code> | |
| and explain each encoded byte.</p></li> | |
| <li><p>Modify a compiled function at runtime to optimize its hot path | |
| using <code>mprotect</code>.</p></li> | |
| </ol> | |
| <hr /> | |
| <p>A natural follow-up topic is <em>Dynamic Linking and Relocation</em>, | |
| including ELF relocation records, PLT/GOT mechanics, and | |
| compiler-emitted relocation fixups, which ties naturally into code | |
| generation and backend work.</p> | |
| <hr /> | |
| <h2 id="appendix-a-x86-64-instruction-reference-quick-guide">Appendix A: | |
| x86-64 Instruction Reference Quick Guide</h2> | |
| <h3 id="a.1-instruction-format-overview">A.1 Instruction Format | |
| Overview</h3> | |
| <h4 id="general-encoding-structure">General Encoding Structure</h4> | |
| <p>[Legacy Prefixes] [REX/VEX/EVEX] [Opcode] [ModR/M] [SIB] | |
| [Displacement] [Immediate]</p> | |
| <h4 id="rex-prefix-40h-4fh">REX Prefix (40h-4Fh)</h4> | |
| <pre class="text"><code>0100 WRXB | |
| W: 64-bit operand size | |
| R: Extension of ModR/M reg field | |
| X: Extension of SIB index field | |
| B: Extension of ModR/M r/m field, SIB base field, or opcode reg field</code></pre> | |
| <h4 id="modrm-byte">ModR/M Byte</h4> | |
| <pre class="text"><code>[7:6] MOD - Addressing mode | |
| [5:3] REG - Register/Opcode extension | |
| [2:0] R/M - Register/Memory</code></pre> | |
| <h3 id="a.2-data-movement-instructions">A.2 Data Movement | |
| Instructions</h3> | |
| <h4 id="basic-moves">Basic Moves</h4> | |
| <pre class="assembly"><code>MOV dst, src ; Move data | |
| MOVZX dst, src ; Move with zero extend | |
| MOVSX dst, src ; Move with sign extend | |
| MOVSXD r64, r/m32 ; Sign extend doubleword to quadword | |
| LEA reg, mem ; Load effective address | |
| XCHG op1, op2 ; Exchange values | |
| BSWAP reg ; Byte swap</code></pre> | |
| <h4 id="stack-operations">Stack Operations</h4> | |
| <pre class="assembly"><code>PUSH op ; Push to stack | |
| POP op ; Pop from stack | |
| PUSHF/PUSHFQ ; Push FLAGS/RFLAGS | |
| POPF/POPFQ ; Pop FLAGS/RFLAGS</code></pre> | |
| <h4 id="conditional-moves-cmovcc">Conditional Moves (CMOVcc)</h4> | |
| <pre class="assembly"><code>CMOVE/CMOVZ ; Move if equal/zero (ZF=1) | |
| CMOVNE/CMOVNZ ; Move if not equal/not zero (ZF=0) | |
| CMOVL/CMOVNGE ; Move if less (SF≠OF) | |
| CMOVLE/CMOVNG ; Move if less or equal (ZF=1 or SF≠OF) | |
| CMOVG/CMOVNLE ; Move if greater (ZF=0 and SF=OF) | |
| CMOVGE/CMOVNL ; Move if greater or equal (SF=OF) | |
| CMOVA/CMOVNBE ; Move if above (CF=0 and ZF=0) | |
| CMOVAE/CMOVNB/CMOVNC ; Move if above or equal (CF=0) | |
| CMOVB/CMOVNAE/CMOVC ; Move if below (CF=1) | |
| CMOVBE/CMOVNA ; Move if below or equal (CF=1 or ZF=1)</code></pre> | |
| <h3 id="a.3-arithmetic-instructions">A.3 Arithmetic Instructions</h3> | |
| <h4 id="integer-arithmetic-2">Integer Arithmetic</h4> | |
| <pre class="assembly"><code>ADD dst, src ; Addition | |
| ADC dst, src ; Add with carry | |
| SUB dst, src ; Subtraction | |
| SBB dst, src ; Subtract with borrow | |
| INC op ; Increment | |
| DEC op ; Decrement | |
| NEG op ; Two's complement negation | |
| CMP op1, op2 ; Compare (sets flags) | |
| MUL src ; Unsigned multiply (RDX:RAX = RAX * src) | |
| IMUL src ; Signed multiply | |
| IMUL dst, src ; Signed multiply (dst = dst * src) | |
| IMUL dst, src, imm ; Signed multiply (dst = src * imm) | |
| DIV src ; Unsigned divide (RAX = RDX:RAX / src, RDX = remainder) | |
| IDIV src ; Signed divide</code></pre> | |
| <h4 id="bcd-and-ascii-adjust-legacy">BCD and ASCII Adjust (Legacy)</h4> | |
| <pre class="assembly"><code>AAA, AAS, AAD, AAM ; ASCII adjust (not in 64-bit mode) | |
| DAA, DAS ; Decimal adjust (not in 64-bit mode)</code></pre> | |
| <h3 id="a.4-logical-instructions">A.4 Logical Instructions</h3> | |
| <pre class="assembly"><code>AND dst, src ; Logical AND | |
| OR dst, src ; Logical OR | |
| XOR dst, src ; Logical XOR | |
| NOT op ; One's complement | |
| TEST op1, op2 ; Logical compare (AND without storing)</code></pre> | |
| <h3 id="a.5-shift-and-rotate-instructions">A.5 Shift and Rotate | |
| Instructions</h3> | |
| <pre class="assembly"><code>SHL/SAL op, count ; Shift left | |
| SHR op, count ; Logical shift right | |
| SAR op, count ; Arithmetic shift right | |
| ROL op, count ; Rotate left | |
| ROR op, count ; Rotate right | |
| RCL op, count ; Rotate left through carry | |
| RCR op, count ; Rotate right through carry | |
| SHLD dst, src, count ; Double precision shift left | |
| SHRD dst, src, count ; Double precision shift right</code></pre> | |
| <h3 id="a.6-bit-manipulation-instructions">A.6 Bit Manipulation | |
| Instructions</h3> | |
| <pre class="assembly"><code>BT op, bit ; Bit test | |
| BTS op, bit ; Bit test and set | |
| BTR op, bit ; Bit test and reset | |
| BTC op, bit ; Bit test and complement | |
| BSF dst, src ; Bit scan forward | |
| BSR dst, src ; Bit scan reverse | |
| LZCNT dst, src ; Leading zero count (LZCNT/ABM) | |
| TZCNT dst, src ; Trailing zero count (BMI1) | |
| POPCNT dst, src ; Population count | |
| ; BMI Instructions | |
| ANDN dst, src1, src2 ; Logical AND NOT | |
| BEXTR dst, src, ctrl ; Bit field extract | |
| BLSI dst, src ; Extract lowest set bit | |
| BLSMSK dst, src ; Mask up to lowest set bit | |
| BLSR dst, src ; Reset lowest set bit</code></pre> | |
| <h3 id="a.7-control-transfer-instructions">A.7 Control Transfer | |
| Instructions</h3> | |
| <h4 id="unconditional-jumps-1">Unconditional Jumps</h4> | |
| <pre class="assembly"><code>JMP target ; Near/far jump | |
| CALL target ; Call procedure | |
| RET [imm16] ; Return from procedure</code></pre> | |
| <h4 id="conditional-jumps-jcc">Conditional Jumps (Jcc)</h4> | |
| <pre class="assembly"><code>JE/JZ ; Jump if equal/zero (ZF=1) | |
| JNE/JNZ ; Jump if not equal/not zero (ZF=0) | |
| JL/JNGE ; Jump if less (SF≠OF) | |
| JLE/JNG ; Jump if less or equal (ZF=1 or SF≠OF) | |
| JG/JNLE ; Jump if greater (ZF=0 and SF=OF) | |
| JGE/JNL ; Jump if greater or equal (SF=OF) | |
| JA/JNBE ; Jump if above (CF=0 and ZF=0) | |
| JAE/JNB/JNC ; Jump if above or equal (CF=0) | |
| JB/JNAE/JC ; Jump if below (CF=1) | |
| JBE/JNA ; Jump if below or equal (CF=1 or ZF=1) | |
| JO, JNO ; Jump if overflow/not overflow | |
| JS, JNS ; Jump if sign/not sign | |
| JP/JPE, JNP/JPO ; Jump if parity even/odd</code></pre> | |
| <h4 id="loop-instructions-1">Loop Instructions</h4> | |
| <pre class="assembly"><code>LOOP target ; Decrement RCX and jump if not zero | |
| LOOPE/LOOPZ ; Loop while equal/zero | |
| LOOPNE/LOOPNZ ; Loop while not equal/not zero | |
| JRCXZ target ; Jump if RCX is zero</code></pre> | |
| <h3 id="a.8-string-instructions">A.8 String Instructions</h3> | |
| <pre class="assembly"><code>MOVS[B/W/D/Q] ; Move string | |
| CMPS[B/W/D/Q] ; Compare string | |
| SCAS[B/W/D/Q] ; Scan string | |
| LODS[B/W/D/Q] ; Load string | |
| STOS[B/W/D/Q] ; Store string | |
| REP ; Repeat while RCX != 0 | |
| REPE/REPZ ; Repeat while equal/zero | |
| REPNE/REPNZ ; Repeat while not equal/not zero</code></pre> | |
| <h3 id="a.9-flag-control-instructions">A.9 Flag Control | |
| Instructions</h3> | |
| <pre class="assembly"><code>CLC, STC, CMC ; Clear/Set/Complement carry | |
| CLD, STD ; Clear/Set direction flag | |
| CLI, STI ; Clear/Set interrupt flag (privileged) | |
| LAHF, SAHF ; Load/Store AH from/to FLAGS</code></pre> | |
| <h3 id="a.10-system-instructions">A.10 System Instructions</h3> | |
| <pre class="assembly"><code>RDMSR, WRMSR ; Read/Write Model-Specific Register | |
| RDTSC, RDTSCP ; Read Time-Stamp Counter | |
| CPUID ; CPU Identification | |
| RDPMC ; Read Performance Counter | |
| XGETBV, XSETBV ; Get/Set Extended Control Register | |
| ; Privileged Instructions | |
| LGDT, SGDT ; Load/Store Global Descriptor Table | |
| LIDT, SIDT ; Load/Store Interrupt Descriptor Table | |
| LLDT, SLDT ; Load/Store Local Descriptor Table | |
| LTR, STR ; Load/Store Task Register</code></pre> | |
| <h3 id="a.11-simd-instructions-sseavx">A.11 SIMD Instructions | |
| (SSE/AVX)</h3> | |
| <h4 id="data-movement">Data Movement</h4> | |
| <pre class="assembly"><code>MOVAPS/MOVUPS ; Move aligned/unaligned packed single | |
| MOVAPD/MOVUPD ; Move aligned/unaligned packed double | |
| MOVDQA/MOVDQU ; Move aligned/unaligned integer | |
| MOVSS/MOVSD ; Move scalar single/double | |
| MOVHPS/MOVLPS ; Move high/low packed single | |
| MOVHPD/MOVLPD ; Move high/low packed double</code></pre> | |
| <h4 id="arithmetic-packed">Arithmetic (Packed)</h4> | |
| <pre class="assembly"><code>ADDPS/ADDPD ; Add packed single/double | |
| SUBPS/SUBPD ; Subtract packed single/double | |
| MULPS/MULPD ; Multiply packed single/double | |
| DIVPS/DIVPD ; Divide packed single/double | |
| SQRTPS/SQRTPD ; Square root packed single/double | |
| MAXPS/MAXPD ; Maximum packed single/double | |
| MINPS/MINPD ; Minimum packed single/double</code></pre> | |
| <h4 id="logical">Logical</h4> | |
| <pre class="assembly"><code>ANDPS/ANDPD ; Bitwise AND | |
| ORPS/ORPD ; Bitwise OR | |
| XORPS/XORPD ; Bitwise XOR | |
| ANDNPS/ANDNPD ; Bitwise AND NOT</code></pre> | |
| <h4 id="comparison">Comparison</h4> | |
| <pre class="assembly"><code>CMPPS/CMPPD ; Compare packed | |
| COMISS/COMISD ; Compare scalar (sets EFLAGS) | |
| UCOMISS/UCOMISD ; Unordered compare scalar</code></pre> | |
| <h4 id="shufflepermute">Shuffle/Permute</h4> | |
| <pre class="assembly"><code>SHUFPS/SHUFPD ; Shuffle packed | |
| UNPCKHPS/UNPCKLPS ; Unpack high/low single | |
| UNPCKHPD/UNPCKLPD ; Unpack high/low double</code></pre> | |
| <h3 id="a.12-avxavx2-instructions">A.12 AVX/AVX2 Instructions</h3> | |
| <h4 id="three-operand-form">Three-Operand Form</h4> | |
| <pre class="assembly"><code>VADDPS dst, src1, src2 ; dst = src1 + src2 | |
| VMULPS dst, src1, src2 ; dst = src1 * src2 | |
| VSUBPS dst, src1, src2 ; dst = src1 - src2</code></pre> | |
| <h4 id="fma-fused-multiply-add">FMA (Fused Multiply-Add)</h4> | |
| <pre class="assembly"><code>VFMADD132PS/PD ; dst = dst * src3 + src2 | |
| VFMADD213PS/PD ; dst = src2 * dst + src3 | |
| VFMADD231PS/PD ; dst = src2 * src3 + dst | |
| VFMSUB###PS/PD ; Variants with subtraction | |
| VFNMADD###PS/PD ; Variants with negation</code></pre> | |
| <h4 id="gatherscatter-avx2avx-512">Gather/Scatter (AVX2/AVX-512)</h4> | |
| <pre class="assembly"><code>VGATHERDPS/VGATHERQPS ; Gather single precision | |
| VGATHERDPD/VGATHERQPD ; Gather double precision | |
| VPGATHERDD/VPGATHERQD ; Gather doublewords | |
| VPGATHERDQ/VPGATHERQQ ; Gather quadwords</code></pre> | |
| <h3 id="a.13-avx-512-instructions">A.13 AVX-512 Instructions</h3> | |
| <h4 id="mask-operations">Mask Operations</h4> | |
| <pre class="assembly"><code>KMOVB/KMOVW/KMOVD/KMOVQ ; Move mask register | |
| KANDW/KANDD/KANDQ ; AND mask registers | |
| KORW/KORD/KORQ ; OR mask registers | |
| KXORW/KXORD/KXORQ ; XOR mask registers | |
| KNOTW/KNOTD/KNOTQ ; NOT mask register | |
| KORTESTW/KORTESTD ; OR and test mask</code></pre> | |
| <h4 id="masked-operations">Masked Operations</h4> | |
| <pre class="assembly"><code>VADDPS zmm1{k1}, zmm2, zmm3 ; Masked addition | |
| VMOVAPS zmm1{k1}{z}, [mem] ; Masked move with zeroing</code></pre> | |
| <h4 id="special-avx-512-instructions">Special AVX-512 Instructions</h4> | |
| <pre class="assembly"><code>VCOMPRESS## ; Compress packed data | |
| VEXPAND## ; Expand packed data | |
| VPERMI2## ; Full permute | |
| VPERMT2## ; Full permute with overwrite | |
| VPCONFLICT## ; Detect conflicts</code></pre> | |
| <h3 id="a.14-transactional-memory-tsx">A.14 Transactional Memory | |
| (TSX)</h3> | |
| <pre class="assembly"><code>XBEGIN target ; Begin transaction | |
| XEND ; End transaction | |
| XABORT imm8 ; Abort transaction | |
| XTEST ; Test if in transaction</code></pre> | |
| <h3 id="a.15-security-extensions">A.15 Security Extensions</h3> | |
| <h4 id="intel-cet-control-flow-enforcement">Intel CET (Control-flow | |
| Enforcement)</h4> | |
| <pre class="assembly"><code>ENDBR32/ENDBR64 ; End branch markers | |
| INCSSPD/INCSSPQ ; Increment shadow stack pointer | |
| RDSSPD/RDSSPQ ; Read shadow stack pointer | |
| SAVEPREVSSP ; Save previous shadow stack pointer | |
| RSTORSSP ; Restore shadow stack pointer</code></pre> | |
| <h4 id="intel-sgx">Intel SGX</h4> | |
| <pre class="assembly"><code>ENCLS ; SGX Supervisor instructions | |
| ENCLU ; SGX User instructions | |
| ENCLV ; SGX Virtualization instructions</code></pre> | |
| <h3 id="a.16-common-instruction-patterns">A.16 Common Instruction | |
| Patterns</h3> | |
| <h4 id="function-prologueepilogue">Function Prologue/Epilogue</h4> | |
| <pre class="assembly"><code>; Prologue | |
| push rbp | |
| mov rbp, rsp | |
| sub rsp, N ; Allocate stack space | |
| ; Epilogue | |
| mov rsp, rbp | |
| pop rbp | |
| ret</code></pre> | |
| <h4 id="system-v-amd64-abi-registers">System V AMD64 ABI Registers</h4> | |
| <p>Arguments: RDI, RSI, RDX, RCX, R8, R9<br /> | |
| Return: RAX (RDX:RAX for 128-bit)<br /> | |
| Preserved: RBX, RBP, RSP, R12-R15<br /> | |
| Scratch: RAX, RCX, RDX, RSI, RDI, R8-R11</p> | |
| <h4 id="windows-x64-abi-registers">Windows x64 ABI Registers</h4> | |
| <p>Arguments: RCX, RDX, R8, R9<br /> | |
| Return: RAX<br /> | |
| Preserved: RBX, RBP, RDI, RSI, RSP, R12-R15<br /> | |
| Scratch: RAX, RCX, RDX, R8-R11</p> | |
| <h3 id="a.17-optimization-guidelines">A.17 Optimization Guidelines</h3> | |
| <h4 id="alignment">Alignment</h4> | |
| <ul> | |
| <li><p>Functions: 16-byte boundary</p></li> | |
| <li><p>Loops: 16 or 32-byte boundary</p></li> | |
| <li><p>Data: Natural alignment (4-byte for DWORD, 8-byte for | |
| QWORD)</p></li> | |
| <li><p>SIMD data: 16-byte (SSE), 32-byte (AVX), 64-byte | |
| (AVX-512)</p></li> | |
| </ul> | |
| <h4 id="instruction-selection-2">Instruction Selection</h4> | |
| <ul> | |
| <li><p>Prefer <code>XOR reg,reg</code> over <code>MOV reg,0</code> for | |
| zeroing</p></li> | |
| <li><p>Use <code>TEST</code> instead of <code>CMP</code> with 0</p></li> | |
| <li><p>Prefer <code>LEA</code> for complex address arithmetic</p></li> | |
| <li><p>Use conditional moves to avoid branches</p></li> | |
| </ul> | |
| <h4 id="pipeline-optimization">Pipeline Optimization</h4> | |
| <ul> | |
| <li><p>Avoid partial register updates</p></li> | |
| <li><p>Minimize dependency chains</p></li> | |
| <li><p>Unroll small loops</p></li> | |
| <li><p>Align branch targets</p></li> | |
| </ul> | |
| <h3 id="summary-2">Summary</h3> | |
| <p>This quick reference covers the most commonly used x86-64 | |
| instructions. For complete details including:</p> | |
| <ul> | |
| <li><p>Exact encoding formats</p></li> | |
| <li><p>All flag effects</p></li> | |
| <li><p>Timing information</p></li> | |
| <li><p>Exception conditions</p></li> | |
| <li><p>Detailed operand forms</p></li> | |
| </ul> | |
| <p>Consult the Intel® 64 and IA-32 Architectures Software Developer’s | |
| Manual volumes, particularly Volume 2 (Instruction Set Reference).</p> | |
| <p>Key patterns to remember:</p> | |
| <ol type="1"> | |
| <li><p>Most instructions support multiple operand sizes | |
| (8/16/32/64-bit)</p></li> | |
| <li><p>Memory operands are encoded with the ModR/M byte and, when | |
| needed, a SIB byte and displacement</p></li> | |
| </ol> | |
| <p>The following dense technical quick reference draws together the | |
| instructions and extensions summarized so far, across Intel’s Vol. 2 ISA | |
| reference and the SSE through AVX‑512 chapters:</p> | |
| <hr /> | |
| <h3 id="instruction-extension-quick-map"><strong>Instruction & | |
| Extension Quick Map</strong></h3> | |
| <h4 id="encoding-basics"><strong>Encoding Basics</strong></h4> | |
| <ul> | |
| <li><p><strong>REX</strong>: <code>0100WRXB</code> — W(64‑bit), R(ext | |
| ModR/M reg), X(ext SIB index), B(ext ModR/M r/m)</p></li> | |
| <li><p><strong>ModR/M</strong>: MOD (addr mode) | REG (reg/opcode) | R/M | |
| (reg/mem)</p></li> | |
| <li><p><strong>VEX/EVEX</strong>: enable 3‑operand forms, wider regs, | |
| masking, embedded rounding</p></li> | |
| </ul> | |
| <h4 id="scalar-and-general-purpose-ops"><strong>Scalar and General | |
| Purpose Ops</strong></h4> | |
| <ul> | |
| <li><p><strong>MOV</strong>/<strong>MOVSX</strong>/<strong>MOVZX</strong> | |
| – transfer and extend</p></li> | |
| <li><p><strong>LEA</strong> – addr calc</p></li> | |
| <li><p>Stack: | |
| <code>PUSH</code>/<code>POP</code>/<code>PUSHFQ</code>/<code>POPFQ</code></p></li> | |
| <li><p>Conditional moves: <code>CMOVcc</code> variants</p></li> | |
| <li><p>Arithmetic: <code>ADD</code>, <code>SUB</code>, <code>ADC</code>, | |
| <code>SBB</code>, <code>INC</code>, <code>DEC</code>, <code>NEG</code>, | |
| <code>CMP</code></p></li> | |
| <li><p>Multiply/divide: <code>MUL</code>, <code>IMUL</code>, | |
| <code>DIV</code>, <code>IDIV</code></p></li> | |
| <li><p>Logic: <code>AND</code>, <code>OR</code>, <code>XOR</code>, | |
| <code>NOT</code>, <code>TEST</code></p></li> | |
| <li><p>Shifts/rotates: <code>SHL</code>, <code>SHR</code>, | |
| <code>SAR</code>, <code>ROL</code>, <code>ROR</code>, <code>SHLD</code>, | |
| <code>SHRD</code></p></li> | |
| <li><p>Bit ops: <code>BT</code>, <code>BTS</code>, <code>BTR</code>, | |
| <code>BTC</code>, <code>BSF</code>, <code>BSR</code>, | |
| <code>POPCNT</code></p></li> | |
| <li><p>Control transfer: <code>Jcc</code>, <code>CALL</code>, | |
| <code>RET</code>, <code>LOOPcc</code></p></li> | |
| </ul> | |
| <h4 id="sse3-ssse3-sse4-highlights"><strong>SSE3 / SSSE3 / SSE4 | |
| Highlights</strong></h4> | |
| <ul> | |
| <li><p><strong>Horizontal arithmetic</strong>: <code>HADDPS/PD</code>, | |
| <code>HSUBPS/PD</code>, <code>ADDSUBPS/PD</code></p></li> | |
| <li><p><strong>Data move/shuffle</strong>: <code>MOVSLDUP</code>, | |
| <code>MOVSHDUP</code>, <code>MOVDDUP</code>, <code>LDDQU</code></p></li> | |
| <li><p><strong>Absolute values</strong>: <code>PABSB/W/D</code></p></li> | |
| <li><p><strong>Sign ops</strong>: <code>PSIGNB/W/D</code></p></li> | |
| <li><p><strong>Horiz add/sub saturating</strong>: | |
| <code>PHADDW/D/SW</code>, <code>PHSUBW/D/SW</code></p></li> | |
| <li><p><strong>Mult/add</strong>: <code>PMADDUBSW</code>, | |
| <code>PMULHRSW</code></p></li> | |
| <li><p><strong>Shuffles</strong>: <code>PSHUFB</code></p></li> | |
| <li><p><strong>Align concat</strong>: <code>PALIGNR</code></p></li> | |
| <li><p><strong>Blend</strong>: <code>BLENDPS/PD</code>, | |
| <code>BLENDVPS/PD</code>, <code>PBLENDVB</code>, | |
| <code>PBLENDW</code></p></li> | |
| <li><p><strong>Dot product</strong>: <code>DPPS</code>, | |
| <code>DPPD</code></p></li> | |
| <li><p><strong>Rounding</strong>: <code>ROUNDPS/PD/SS/SD</code></p></li> | |
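| <li><p><strong>Min/max int</strong>: <code>PMINSB</code>/<code>PMAXSB</code> + | |
| unsigned‑word and signed/unsigned‑dword variants (<code>PMINUW</code>, | |
| <code>PMINSD</code>, <code>PMINUD</code> and their <code>PMAX</code> | |
| counterparts)</p></li> | |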
| <li><p><strong>String ops</strong>: <code>PCMPxSTRx</code></p></li> | |
| <li><p><strong>CRC</strong>: <code>CRC32</code></p></li> | |
| <li><p><strong>POPCNT</strong> – population count</p></li> | |
| </ul> | |
| <h4 id="avx-avx2"><strong>AVX / AVX2</strong></h4> | |
| <ul> | |
| <li><p><strong>YMM regs</strong> – 256‑bit, XMM alias</p></li> | |
| <li><p><strong>VEX</strong> – 3‑operand, non‑destructive</p></li> | |
| <li><p><strong>State mgmt</strong>: <code>VZEROUPPER</code>, | |
| <code>XSAVE</code>/<code>XRSTOR</code>, | |
| <code>XGETBV</code>/<code>XSETBV</code></p></li> | |
| <li><p><strong>FP ops</strong>: <code>VADDPS/PD</code>, | |
| <code>VMULPS/PD</code>, <code>VDIVPS/PD</code>, | |
| <code>VSQRTPS/PD</code></p></li> | |
| <li><p><strong>FMA3</strong>: <code>VFMADD*</code>, | |
| <code>VFMSUB*</code>, <code>VFNMADD*</code>, | |
| <code>VFNMSUB*</code></p></li> | |
| <li><p><strong>Cmp/mask</strong>: <code>VCMP*</code>, | |
| <code>VBLENDV*</code></p></li> | |
| <li><p><strong>Broadcast</strong>: <code>VBROADCASTSS/SD</code></p></li> | |
| <li><p><strong>Permute/shuffle</strong>: <code>VPERM2F128</code>, | |
| <code>VPERMILPS/PD</code></p></li> | |
| <li><p><strong>Blend imm/var</strong>: <code>VBLENDPS/PD</code>, | |
| <code>VBLENDVPS/PD</code></p></li> | |
| <li><p><strong>Integer ops 256‑bit</strong>: add/sub/mul/shift | |
| (<code>VPADD*</code>, <code>VPSUB*</code>, <code>VPMUL*</code>, | |
| <code>VPSLLV*</code>, <code>VPSRAV*</code>)</p></li> | |
| <li><p><strong>Gather</strong>: <code>VGATHER*</code>, | |
| <code>VPGATHER*</code></p></li> | |
| </ul> | |
| <h4 id="avx512"><strong>AVX‑512</strong></h4> | |
| <ul> | |
| <li><p><strong>ZMM regs</strong> – 512‑bit, k0‑k7 masks</p></li> | |
| <li><p><strong>EVEX</strong> – 512‑bit, predicated exec, embedded | |
| broadcast/rounding</p></li> | |
| <li><p><strong>Foundation</strong>: 512‑bit FP/INT ops, | |
| <code>VREDUCE*</code>, <code>VRCP14*</code>, | |
| <code>VRSQRT14*</code></p></li> | |
| <li><p><strong>Permute</strong>: <code>VPERMI2*</code>, | |
| <code>VPERMT2*</code>, <code>VPERM*</code></p></li> | |
| <li><p><strong>Compress/expand</strong>: <code>VCOMPRESS*</code>, | |
| <code>VEXPAND*</code></p></li> | |
| <li><p><strong>Ternary logic</strong>: | |
| <code>VPTERNLOGD/Q</code></p></li> | |
| <li><p><strong>Scatter</strong>: <code>VPSCATTER*</code></p></li> | |
| <li><p><strong>BW/DQ/VNNI/IFMA</strong>: byte/word/dword/qword | |
| extensions, DL‑oriented dot products, integer FMA</p></li> | |
| <li><p><strong>Mask forms</strong>: <code>{k}{z}</code> zeroing, | |
| merging</p></li> | |
| <li><p><strong>Conflict detect</strong>: | |
| <code>VPCONFLICT*</code></p></li> | |
| </ul> | |
| <h4 id="systemarch"><strong>System/Arch</strong></h4> | |
| <ul> | |
| <li><p>Rings 0–3, GDT/IDT/TSS (<code>LGDT</code>, <code>LIDT</code>, | |
| <code>LTR</code>), control regs (CR0/3/4)</p></li> | |
| <li><p>MSRs (<code>RDMSR</code>/<code>WRMSR</code>), syscall/sysret, | |
| sysenter/sysexit</p></li> | |
| <li><p>Paging: 4‑/5‑level, PTE flags (NX, G, U/S, R/W, P)</p></li> | |
| <li><p>TLB ops: <code>INVLPG</code>, <code>INVPCID</code>, <code>INVVPID</code> (VMX)</p></li> | |
| <li><p>Prot keys: <code>WRPKRU</code>, <code>RDPKRU</code></p></li> | |
| <li><p>MTRRs – cache type controls</p></li> | |
| <li><p>SMEP/SMAP – block supervisor‑mode execution of (SMEP) and data access to (SMAP) user pages</p></li> | |
| </ul> | |
| <p>This quick map compresses the Chapter 8–12 progression into a lookup | |
| table‑style reference while tying it to encoding details and system‑level | |
| operations. A natural companion is an “intrinsics map” for <strong>each | |
| extension (SSE3 through AVX‑512)</strong>, pairing Intel manual mnemonics | |
| with their C/C++ intrinsic function names — handy for hybrid assembly/C | |
| optimization work.</p> | |
| <hr /> | |
| <h2 id="appendix-b-system-v-amd64-abi-summary">Appendix B: System V | |
| AMD64 ABI Summary</h2> | |
| <h3 id="b.1-register-usage-conventions">B.1 Register Usage | |
| Conventions</h3> | |
| <h4 id="general-purpose-registers-1">General Purpose Registers</h4> | |
| <table> | |
| <colgroup> | |
| <col style="width: 28%" /> | |
| <col style="width: 20%" /> | |
| <col style="width: 31%" /> | |
| <col style="width: 20%" /> | |
| </colgroup> | |
| <thead> | |
| <tr> | |
| <th>Register</th> | |
| <th>Usage</th> | |
| <th>Preserved</th> | |
| <th>Notes</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td><strong>RAX</strong></td> | |
| <td>Return value, 1st return register</td> | |
| <td>No</td> | |
| <td>Also used for syscall number</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RBX</strong></td> | |
| <td>Callee-saved</td> | |
| <td>Yes</td> | |
| <td>Must be preserved across calls</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RCX</strong></td> | |
| <td>4th argument</td> | |
| <td>No</td> | |
| <td>Used in syscalls for return address</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RDX</strong></td> | |
| <td>3rd argument, 2nd return register</td> | |
| <td>No</td> | |
| <td>High 64 bits of 128-bit return</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RSI</strong></td> | |
| <td>2nd argument</td> | |
| <td>No</td> | |
| <td>Source index for string ops</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RDI</strong></td> | |
| <td>1st argument</td> | |
| <td>No</td> | |
| <td>Destination index for string ops</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RBP</strong></td> | |
| <td>Frame pointer (optional)</td> | |
| <td>Yes</td> | |
| <td>Can be used as general register with | |
| <code>-fomit-frame-pointer</code></td> | |
| </tr> | |
| <tr> | |
| <td><strong>RSP</strong></td> | |
| <td>Stack pointer</td> | |
| <td>Yes</td> | |
| <td>Must be 16-byte aligned before CALL</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R8</strong></td> | |
| <td>5th argument</td> | |
| <td>No</td> | |
| <td>Additional scratch register</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R9</strong></td> | |
| <td>6th argument</td> | |
| <td>No</td> | |
| <td>Additional scratch register</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R10</strong></td> | |
| <td>Scratch, static chain pointer</td> | |
| <td>No</td> | |
| <td>Used for nested functions</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R11</strong></td> | |
| <td>Scratch</td> | |
| <td>No</td> | |
| <td>Used by syscall/sysret</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R12</strong></td> | |
| <td>Callee-saved</td> | |
| <td>Yes</td> | |
| <td>Must be preserved</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R13</strong></td> | |
| <td>Callee-saved</td> | |
| <td>Yes</td> | |
| <td>Must be preserved</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R14</strong></td> | |
| <td>Callee-saved</td> | |
| <td>Yes</td> | |
| <td>Must be preserved</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R15</strong></td> | |
| <td>Callee-saved</td> | |
| <td>Yes</td> | |
| <td>Must be preserved</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h4 id="floating-point-registers">Floating-Point Registers</h4> | |
| <table> | |
| <colgroup> | |
| <col style="width: 28%" /> | |
| <col style="width: 20%" /> | |
| <col style="width: 31%" /> | |
| <col style="width: 20%" /> | |
| </colgroup> | |
| <thead> | |
| <tr> | |
| <th>Register</th> | |
| <th>Usage</th> | |
| <th>Preserved</th> | |
| <th>Notes</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td><strong>XMM0</strong></td> | |
| <td>1st FP arg, FP return value</td> | |
| <td>No</td> | |
| <td>Also used for complex returns</td> | |
| </tr> | |
| <tr> | |
| <td><strong>XMM1</strong></td> | |
| <td>2nd FP arg, 2nd FP return</td> | |
| <td>No</td> | |
| <td>Imaginary part of complex</td> | |
| </tr> | |
| <tr> | |
| <td><strong>XMM2-XMM7</strong></td> | |
| <td>3rd-8th FP arguments</td> | |
| <td>No</td> | |
| <td>Scratch registers</td> | |
| </tr> | |
| <tr> | |
| <td><strong>XMM8-XMM15</strong></td> | |
| <td>Scratch</td> | |
| <td>No</td> | |
| <td>Additional temporaries</td> | |
| </tr> | |
| <tr> | |
| <td><strong>YMM0-YMM15</strong></td> | |
| <td>AVX extension of XMM</td> | |
| <td>No</td> | |
| <td>Upper 128 bits not preserved</td> | |
| </tr> | |
| <tr> | |
| <td><strong>ZMM0-ZMM31</strong></td> | |
| <td>AVX-512 extension</td> | |
| <td>No</td> | |
| <td>Upper bits not preserved</td> | |
| </tr> | |
| <tr> | |
| <td><strong>K0-K7</strong></td> | |
| <td>AVX-512 mask registers</td> | |
| <td>No</td> | |
| <td>K0 usually means no masking</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h4 id="special-registers">Special Registers</h4> | |
| <table> | |
| <colgroup> | |
| <col style="width: 41%" /> | |
| <col style="width: 29%" /> | |
| <col style="width: 29%" /> | |
| </colgroup> | |
| <thead> | |
| <tr> | |
| <th>Register</th> | |
| <th>Usage</th> | |
| <th>Notes</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td><strong>RFLAGS</strong></td> | |
| <td>Status flags</td> | |
| <td>DF must be clear on entry/exit</td> | |
| </tr> | |
| <tr> | |
| <td><strong>MXCSR</strong></td> | |
| <td>SSE control/status</td> | |
| <td>Must preserve rounding mode, exception masks</td> | |
| </tr> | |
| <tr> | |
| <td><strong>x87 FPU</strong></td> | |
| <td>Legacy floating-point</td> | |
| <td>Not preserved, should be empty on entry</td> | |
| </tr> | |
| <tr> | |
| <td><strong>FS</strong></td> | |
| <td>Thread-local storage</td> | |
| <td>Reserved for system use</td> | |
| </tr> | |
| <tr> | |
| <td><strong>GS</strong></td> | |
| <td>Thread-local storage (kernel)</td> | |
| <td>Reserved for system use</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h3 id="b.2-function-calling-convention">B.2 Function Calling | |
| Convention</h3> | |
| <h4 id="argument-passing">Argument Passing</h4> | |
| <div class="sourceCode" id="cb269"><pre | |
| class="sourceCode c"><code class="sourceCode c"><span id="cb269-1"><a href="#cb269-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Function prototype</span></span> | |
| <span id="cb269-2"><a href="#cb269-2" aria-hidden="true" tabindex="-1"></a><span class="dt">long</span> func<span class="op">(</span><span class="dt">int</span> a<span class="op">,</span> <span class="dt">long</span> b<span class="op">,</span> <span class="dt">char</span> <span class="op">*</span>c<span class="op">,</span> <span class="dt">double</span> d<span class="op">,</span> <span class="dt">float</span> e<span class="op">,</span> <span class="dt">short</span> f<span class="op">,</span> </span> | |
| <span id="cb269-3"><a href="#cb269-3" aria-hidden="true" tabindex="-1"></a> <span class="dt">long</span> g<span class="op">,</span> <span class="dt">double</span> h<span class="op">,</span> <span class="dt">int</span> i<span class="op">);</span></span> | |
| <span id="cb269-4"><a href="#cb269-4" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb269-5"><a href="#cb269-5" aria-hidden="true" tabindex="-1"></a><span class="co">// Register assignments:</span></span> | |
| <span id="cb269-6"><a href="#cb269-6" aria-hidden="true" tabindex="-1"></a><span class="co">// RDI = a (int → passed in EDI; upper 32 bits of RDI are unspecified)</span></span> | |
| <span id="cb269-7"><a href="#cb269-7" aria-hidden="true" tabindex="-1"></a><span class="co">// RSI = b (long)</span></span> | |
| <span id="cb269-8"><a href="#cb269-8" aria-hidden="true" tabindex="-1"></a><span class="co">// RDX = c (pointer)</span></span> | |
| <span id="cb269-9"><a href="#cb269-9" aria-hidden="true" tabindex="-1"></a><span class="co">// XMM0 = d (double)</span></span> | |
| <span id="cb269-10"><a href="#cb269-10" aria-hidden="true" tabindex="-1"></a><span class="co">// XMM1 = e (float)</span></span> | |
| <span id="cb269-11"><a href="#cb269-11" aria-hidden="true" tabindex="-1"></a><span class="co">// RCX = f (short → sign-extended)</span></span> | |
| <span id="cb269-12"><a href="#cb269-12" aria-hidden="true" tabindex="-1"></a><span class="co">// R8 = g (long)</span></span> | |
| <span id="cb269-13"><a href="#cb269-13" aria-hidden="true" tabindex="-1"></a><span class="co">// XMM2 = h (double)</span></span> | |
| <span id="cb269-14"><a href="#cb269-14" aria-hidden="true" tabindex="-1"></a><span class="co">// R9 = i (int → passed in R9D; upper 32 bits unspecified)</span></span></code></pre></div> | |
| <h4 id="classification-rules">Classification Rules</h4> | |
| <ol type="1"> | |
| <li><p><strong>INTEGER class</strong>: Integers, pointers (≤8 | |
| bytes)</p></li> | |
| <li><p><strong>SSE class</strong>: float, double, __m64, and the low | |
| halves of __m128 and __float128</p></li> | |
| <li><p><strong>SSEUP class</strong>: Upper half of __m128 and | |
| __float128</p></li> | |
| <li><p><strong>X87/X87UP class</strong>: long double (64-bit mantissa | |
| is X87; exponent plus padding is X87UP)</p></li> | |
| <li><p><strong>MEMORY class</strong>: Aggregates >16 bytes or | |
| misaligned</p></li> | |
| <li><p><strong>NO_CLASS</strong>: Padding bytes</p></li> | |
| </ol> | |
| <h4 id="aggregate-structunion-passing">Aggregate (Struct/Union) | |
| Passing</h4> | |
| <div class="sourceCode" id="cb270"><pre | |
| class="sourceCode c"><code class="sourceCode c"><span id="cb270-1"><a href="#cb270-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Passed in registers (≤16 bytes, proper alignment)</span></span> | |
| <span id="cb270-2"><a href="#cb270-2" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Point <span class="op">{</span> <span class="dt">double</span> x<span class="op">,</span> y<span class="op">;</span> <span class="op">};</span> <span class="co">// XMM0 (x), XMM1 (y)</span></span> | |
| <span id="cb270-3"><a href="#cb270-3" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Small <span class="op">{</span> <span class="dt">int</span> a<span class="op">;</span> <span class="dt">char</span> b<span class="op">;</span> <span class="op">};</span> <span class="co">// RDI (packed in single register)</span></span> | |
| <span id="cb270-4"><a href="#cb270-4" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb270-5"><a href="#cb270-5" aria-hidden="true" tabindex="-1"></a><span class="co">// Passed in memory: copied onto the stack (>16 bytes or complex)</span></span> | |
| <span id="cb270-6"><a href="#cb270-6" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Large <span class="op">{</span> <span class="dt">double</span> arr<span class="op">[</span><span class="dv">10</span><span class="op">];</span> <span class="op">};</span> <span class="co">// Copied onto the stack (MEMORY class)</span></span> | |
| <span id="cb270-7"><a href="#cb270-7" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Mixed <span class="op">{</span> <span class="dt">int</span> x<span class="op">;</span> <span class="dt">double</span> y<span class="op">[</span><span class="dv">3</span><span class="op">];</span> <span class="op">};</span> <span class="co">// Copied onto the stack (MEMORY class)</span></span></code></pre></div> | |
| <h4 id="variable-arguments-va_args">Variable Arguments (va_args)</h4> | |
| <pre class="assembly"><code>; Before calling variadic function: | |
| mov eax, num_fp_args ; Number of XMM registers used (0-8) | |
| ; Only AL is read: an upper bound on the number of vector registers used</code></pre> | |
| <h3 id="b.3-stack-frame-layout">B.3 Stack Frame Layout</h3> | |
| <h4 id="stack-organization-high-to-low-address">Stack Organization (High | |
| to Low Address)</h4> | |
| <pre><code>+------------------------+ Higher addresses | |
| | Previous frame         | | |
| +------------------------+ | |
| | Return address         | ← Pushed by CALL | |
| +------------------------+ | |
| | Previous RBP           | ← Optional frame pointer | |
| +------------------------+ | |
| | Callee-saved regs      | ← If used (RBX, R12-R15, etc.) | |
| +------------------------+ | |
| | Local variables        | ← Locals and temporaries | |
| +------------------------+ | |
| | Alloca() space         | ← Dynamic allocations | |
| +------------------------+ | |
| | Padding                | ← For 16-byte alignment | |
| +------------------------+ | |
| | Outgoing args          | ← Args 7+ for calls | |
| +------------------------+ ← RSP (must be 16-byte aligned before CALL) | |
| | Red zone (128 bytes)   | ← Leaf function scratch space (below RSP) | |
| +------------------------+ Lower addresses</code></pre> | |
| <h4 id="red-zone-1">Red Zone</h4> | |
| <ul> | |
| <li><p>128 bytes below RSP</p></li> | |
| <li><p>Available for leaf functions (functions that don’t call | |
| others)</p></li> | |
| <li><p>Not preserved across function calls</p></li> | |
| <li><p>Signal and interrupt handlers must not modify the red zone (kernel code, which can be interrupted on its own stack, is built without one)</p></li> | |
| </ul> | |
| <h4 id="stack-alignment-1">Stack Alignment</h4> | |
| <pre class="assembly"><code>; Stack must be 16-byte aligned before CALL | |
| ; (RSP + 8) % 16 == 0 at function entry | |
| ; RSP % 16 == 0 before making a call | |
| ; Typical prologue ensuring alignment: | |
| push rbp ; RSP now 16-byte aligned | |
| mov rbp, rsp | |
| sub rsp, N ; N must maintain 16-byte alignment | |
| and rsp, -16 ; Force alignment if needed</code></pre> | |
| <h3 id="b.4-return-values">B.4 Return Values</h3> | |
| <h4 id="scalar-returns">Scalar Returns</h4> | |
| <ul> | |
| <li><p><strong>Integers/Pointers</strong>: RAX (up to 64 bits)</p></li> | |
| <li><p><strong>128-bit integers</strong>: RDX:RAX (high:low)</p></li> | |
| <li><p><strong>Floating-point</strong>: XMM0 (float/double)</p></li> | |
| <li><p><strong>Long double</strong>: ST(0) (x87 stack)</p></li> | |
| <li><p><strong>Complex float</strong>: XMM0 only (real in bits 31:0, | |
| imaginary in bits 63:32)</p></li> | |
| <li><p><strong>Complex double</strong>: XMM0 (real), XMM1 | |
| (imaginary)</p></li> | |
| </ul> | |
| <h4 id="aggregate-returns">Aggregate Returns</h4> | |
| <div class="sourceCode" id="cb273"><pre | |
| class="sourceCode c"><code class="sourceCode c"><span id="cb273-1"><a href="#cb273-1" aria-hidden="true" tabindex="-1"></a><span class="co">// Small struct (≤16 bytes) - returned in registers</span></span> | |
| <span id="cb273-2"><a href="#cb273-2" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Pair <span class="op">{</span> <span class="dt">long</span> a<span class="op">,</span> b<span class="op">;</span> <span class="op">};</span> <span class="co">// RAX (a), RDX (b)</span></span> | |
| <span id="cb273-3"><a href="#cb273-3" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> FPPair <span class="op">{</span> <span class="dt">double</span> x<span class="op">,</span> y<span class="op">;</span> <span class="op">};</span> <span class="co">// XMM0 (x), XMM1 (y)</span></span> | |
| <span id="cb273-4"><a href="#cb273-4" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb273-5"><a href="#cb273-5" aria-hidden="true" tabindex="-1"></a><span class="co">// Large struct (>16 bytes) - returned via hidden pointer</span></span> | |
| <span id="cb273-6"><a href="#cb273-6" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Large <span class="op">{</span> <span class="dt">double</span> data<span class="op">[</span><span class="dv">10</span><span class="op">];</span> <span class="op">};</span> <span class="co">// Caller allocates, address in RDI</span></span> | |
| <span id="cb273-7"><a href="#cb273-7" aria-hidden="true" tabindex="-1"></a><span class="co">// Other args shift: 1st visible arg → RSI, 2nd → RDX, etc.</span></span></code></pre></div> | |
| <h3 id="b.5-function-prologue-and-epilogue">B.5 Function Prologue and | |
| Epilogue</h3> | |
| <h4 id="standard-prologue">Standard Prologue</h4> | |
| <pre class="assembly"><code>func: | |
| push rbp ; Save frame pointer (optional) | |
| mov rbp, rsp ; Establish frame pointer (optional) | |
| push rbx ; Save callee-saved registers | |
| push r12 | |
| push r13 | |
| push r14 | |
| push r15 | |
| sub rsp, N ; Allocate local space (maintain alignment) | |
| ; Function body...</code></pre> | |
| <h4 id="standard-epilogue">Standard Epilogue</h4> | |
| <pre class="assembly"><code> ; Function body ends | |
| add rsp, N ; Deallocate locals | |
| pop r15 ; Restore callee-saved registers | |
| pop r14 | |
| pop r13 | |
| pop r12 | |
| pop rbx | |
| pop rbp ; Restore frame pointer | |
| ret ; Return to caller</code></pre> | |
| <h4 id="leaf-function-optimization-1">Leaf Function Optimization</h4> | |
| <pre class="assembly"><code>leaf_func: | |
| ; No prologue needed if: | |
| ; - No calls to other functions | |
| ; - Uses only red zone (128 bytes) | |
| ; - Doesn't need callee-saved registers | |
| mov [rsp-8], rdi ; Can use red zone | |
| ; ... computation ... | |
| mov rax, [rsp-8] ; Return value | |
| ret</code></pre> | |
| <h3 id="b.6-system-calls">B.6 System Calls</h3> | |
| <h4 id="linux-system-call-convention">Linux System Call Convention</h4> | |
| <table> | |
| <thead> | |
| <tr> | |
| <th>Register</th> | |
| <th>Usage</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td><strong>RAX</strong></td> | |
| <td>System call number</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RDI</strong></td> | |
| <td>1st argument</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RSI</strong></td> | |
| <td>2nd argument</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RDX</strong></td> | |
| <td>3rd argument</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R10</strong></td> | |
| <td>4th argument (not RCX!)</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R8</strong></td> | |
| <td>5th argument</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R9</strong></td> | |
| <td>6th argument</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RAX</strong></td> | |
| <td>Return value (-errno on error)</td> | |
| </tr> | |
| <tr> | |
| <td><strong>RCX</strong></td> | |
| <td>Destroyed (stores return address)</td> | |
| </tr> | |
| <tr> | |
| <td><strong>R11</strong></td> | |
| <td>Destroyed (stores RFLAGS)</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h4 id="system-call-example">System Call Example</h4> | |
| <pre class="assembly"><code>; write(1, "Hello\n", 6) | |
| mov rax, 1 ; sys_write | |
| mov rdi, 1 ; fd = stdout | |
| lea rsi, [msg] ; buffer | |
| mov rdx, 6 ; count | |
| syscall ; Make system call | |
| ; RAX = bytes written or -errno</code></pre> | |
| <h4 id="common-system-call-numbers">Common System Call Numbers</h4> | |
| <pre class="assembly"><code>; Linux x86-64 system calls (selection) | |
| SYS_read equ 0 | |
| SYS_write equ 1 | |
| SYS_open equ 2 | |
| SYS_close equ 3 | |
| SYS_mmap equ 9 | |
| SYS_mprotect equ 10 | |
| SYS_munmap equ 11 | |
| SYS_brk equ 12 | |
| SYS_ioctl equ 16 | |
| SYS_access equ 21 | |
| SYS_pipe equ 22 | |
| SYS_select equ 23 | |
| SYS_mremap equ 25 | |
| SYS_fork equ 57 | |
| SYS_vfork equ 58 | |
| SYS_execve equ 59 | |
| SYS_exit equ 60 | |
| SYS_wait4 equ 61 | |
| SYS_kill equ 62 | |
| SYS_uname equ 63 | |
| SYS_fcntl equ 72 | |
| SYS_flock equ 73 | |
| SYS_fsync equ 74 | |
| SYS_fdatasync equ 75 | |
| SYS_truncate equ 76 | |
| SYS_getdents equ 78 | |
| SYS_getcwd equ 79 | |
| SYS_chdir equ 80 | |
| SYS_fchdir equ 81 | |
| SYS_rename equ 82 | |
| SYS_mkdir equ 83 | |
| SYS_rmdir equ 84 | |
| SYS_creat equ 85 | |
| SYS_link equ 86 | |
| SYS_unlink equ 87 | |
| SYS_symlink equ 88 | |
| SYS_readlink equ 89 | |
| SYS_chmod equ 90 | |
| SYS_fchmod equ 91 | |
| SYS_chown equ 92 | |
| SYS_fchown equ 93 | |
| SYS_lchown equ 94 | |
| SYS_getuid equ 102 | |
| SYS_syslog equ 103 | |
| SYS_getgid equ 104 | |
| SYS_setuid equ 105 | |
| SYS_setgid equ 106 | |
| SYS_geteuid equ 107 | |
| SYS_getegid equ 108 | |
| SYS_setpgid equ 109 | |
| SYS_getppid equ 110 | |
| SYS_getpgrp equ 111 | |
| SYS_setsid equ 112 | |
| SYS_getsid equ 124 | |
| SYS_clone equ 56 | |
| SYS_exit_group equ 231</code></pre> | |
| <h3 id="b.7-thread-local-storage-tls">B.7 Thread-Local Storage | |
| (TLS)</h3> | |
| <h4 id="tls-access-models">TLS Access Models</h4> | |
| <pre class="assembly"><code>; Initial Exec (IE) - static TLS, offset loaded from the GOT | |
| mov rax, QWORD PTR variable@gottpoff[rip] | |
| mov rax, QWORD PTR fs:[rax] | |
| ; Local Exec (LE) - executable's TLS | |
| mov rax, QWORD PTR fs:variable@tpoff | |
| ; General Dynamic (GD) - dlopen'ed libraries | |
| lea rdi, variable@tlsgd[rip] | |
| call __tls_get_addr@plt | |
| ; Local Dynamic (LD) - multiple TLS vars | |
| lea rdi, variable@tlsld[rip] | |
| call __tls_get_addr@plt</code></pre> | |
| <h3 id="b.8-exception-handling">B.8 Exception Handling</h3> | |
| <h4 id="stack-unwinding-dwarf">Stack Unwinding (DWARF)</h4> | |
| <pre class="assembly"><code>.cfi_startproc ; Start of function | |
| .cfi_def_cfa_offset 16 ; Define CFA offset | |
| .cfi_offset rbp, -16 ; RBP saved at CFA-16 | |
| .cfi_def_cfa_register rbp ; Use RBP as frame base | |
| .cfi_endproc ; End of function</code></pre> | |
| <h4 id="c-exception-handling">C++ Exception Handling</h4> | |
| <ul> | |
| <li><p>Landing pads for catch blocks</p></li> | |
| <li><p>Personality routine: <code>__gxx_personality_v0</code></p></li> | |
| <li><p>Unwinding library: <code>libgcc_s.so</code> / | |
| <code>libunwind</code></p></li> | |
| </ul> | |
| <h3 id="b.9-data-alignment-requirements">B.9 Data Alignment | |
| Requirements</h3> | |
| <table> | |
| <thead> | |
| <tr> | |
| <th>Type</th> | |
| <th>Size</th> | |
| <th>Alignment</th> | |
| <th>Notes</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td><code>char</code></td> | |
| <td>1</td> | |
| <td>1</td> | |
| <td>No alignment requirement</td> | |
| </tr> | |
| <tr> | |
| <td><code>short</code></td> | |
| <td>2</td> | |
| <td>2</td> | |
| <td>Natural alignment</td> | |
| </tr> | |
| <tr> | |
| <td><code>int</code></td> | |
| <td>4</td> | |
| <td>4</td> | |
| <td>Natural alignment</td> | |
| </tr> | |
| <tr> | |
| <td><code>long</code></td> | |
| <td>8</td> | |
| <td>8</td> | |
| <td>Natural alignment</td> | |
| </tr> | |
| <tr> | |
| <td><code>float</code></td> | |
| <td>4</td> | |
| <td>4</td> | |
| <td>Natural alignment</td> | |
| </tr> | |
| <tr> | |
| <td><code>double</code></td> | |
| <td>8</td> | |
| <td>8</td> | |
| <td>Natural alignment</td> | |
| </tr> | |
| <tr> | |
| <td><code>long double</code></td> | |
| <td>16</td> | |
| <td>16</td> | |
| <td>Extended precision</td> | |
| </tr> | |
| <tr> | |
| <td><code>__int128</code></td> | |
| <td>16</td> | |
| <td>16</td> | |
| <td>GCC extension</td> | |
| </tr> | |
| <tr> | |
| <td><code>pointer</code></td> | |
| <td>8</td> | |
| <td>8</td> | |
| <td>All pointers are 64-bit</td> | |
| </tr> | |
| <tr> | |
| <td><code>__m64</code></td> | |
| <td>8</td> | |
| <td>8</td> | |
| <td>MMX type</td> | |
| </tr> | |
| <tr> | |
| <td><code>__m128</code></td> | |
| <td>16</td> | |
| <td>16</td> | |
| <td>SSE type</td> | |
| </tr> | |
| <tr> | |
| <td><code>__m256</code></td> | |
| <td>32</td> | |
| <td>32</td> | |
| <td>AVX type</td> | |
| </tr> | |
| <tr> | |
| <td><code>__m512</code></td> | |
| <td>64</td> | |
| <td>64</td> | |
| <td>AVX-512 type</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h4 id="structure-padding">Structure Padding</h4> | |
| <div class="sourceCode" id="cb281"><pre | |
| class="sourceCode c"><code class="sourceCode c"><span id="cb281-1"><a href="#cb281-1" aria-hidden="true" tabindex="-1"></a><span class="kw">struct</span> Example <span class="op">{</span></span> | |
| <span id="cb281-2"><a href="#cb281-2" aria-hidden="true" tabindex="-1"></a> <span class="dt">char</span> a<span class="op">;</span> <span class="co">// offset 0</span></span> | |
| <span id="cb281-3"><a href="#cb281-3" aria-hidden="true" tabindex="-1"></a> <span class="co">// 3 bytes padding</span></span> | |
| <span id="cb281-4"><a href="#cb281-4" aria-hidden="true" tabindex="-1"></a> <span class="dt">int</span> b<span class="op">;</span> <span class="co">// offset 4</span></span> | |
| <span id="cb281-5"><a href="#cb281-5" aria-hidden="true" tabindex="-1"></a> <span class="dt">char</span> c<span class="op">;</span> <span class="co">// offset 8</span></span> | |
| <span id="cb281-6"><a href="#cb281-6" aria-hidden="true" tabindex="-1"></a> <span class="co">// 7 bytes padding</span></span> | |
| <span id="cb281-7"><a href="#cb281-7" aria-hidden="true" tabindex="-1"></a> <span class="dt">double</span> d<span class="op">;</span> <span class="co">// offset 16</span></span> | |
| <span id="cb281-8"><a href="#cb281-8" aria-hidden="true" tabindex="-1"></a><span class="op">};</span> <span class="co">// Total size: 24 bytes (aligned to 8)</span></span></code></pre></div> | |
| <h3 id="b.10-executable-file-format-elf">B.10 Executable File Format | |
| (ELF)</h3> | |
| <h4 id="program-headers">Program Headers</h4> | |
| <pre><code>Type     Offset  VirtAddr  PhysAddr  FileSize  MemSize  Flg  Align | |
| PHDR     0x40    0x400040  0x400040  0x1f8     0x1f8    R    0x8 | |
| INTERP   0x238   0x400238  0x400238  0x1c      0x1c     R    0x1 | |
| LOAD     0x0     0x400000  0x400000  0x1234    0x1234   R E  0x200000 | |
| LOAD     0x1e10  0x601e10  0x601e10  0x230     0x238    RW   0x200000 | |
| DYNAMIC  ... (entry truncated in the original listing)</code></pre> | |
| <p>The following <strong>System V AMD64 ABI Summary</strong> pulls all the key | |
| details together into one structured reference.</p> | |
| <hr /> | |
| <h3 id="register-usage-and-preservation-rules">1️⃣ Register Usage and | |
| Preservation Rules</h3> | |
| <p><strong>General-purpose registers (64‑bit)</strong></p> | |
| <table> | |
| <colgroup> | |
| <col style="width: 7%" /> | |
| <col style="width: 58%" /> | |
| <col style="width: 33%" /> | |
| </colgroup> | |
| <thead> | |
| <tr> | |
| <th>Reg</th> | |
| <th>Purpose</th> | |
| <th>Preserved across calls?</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>RAX</td> | |
| <td>Return value / syscall number</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>RDI</td> | |
| <td>1ˢᵗ integer/pointer arg</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>RSI</td> | |
| <td>2ⁿᵈ integer/pointer arg</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>RDX</td> | |
| <td>3ʳᵈ integer/pointer arg</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>RCX</td> | |
| <td>4ᵗʰ integer/pointer arg</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>R8</td> | |
| <td>5ᵗʰ integer/pointer arg</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>R9</td> | |
| <td>6ᵗʰ integer/pointer arg</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>R10</td> | |
| <td>Scratch / 4ᵗʰ syscall arg</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>R11</td> | |
| <td>Scratch</td> | |
| <td>No</td> | |
| </tr> | |
| <tr> | |
| <td>RBX</td> | |
| <td>Callee-saved</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>RBP</td> | |
| <td>Frame pointer (opt.) / callee-saved</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>RSP</td> | |
| <td>Stack pointer</td> | |
| <td>Yes</td> | |
| </tr> | |
| <tr> | |
| <td>R12–R15</td> | |
| <td>Callee-saved</td> | |
| <td>Yes</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p><strong>Vector/Floating registers</strong></p> | |
| <ul> | |
| <li><p>XMM0–XMM7: FP args/returns, caller‑saved</p></li> | |
| <li><p>XMM8–XMM15: Caller‑saved temps</p></li> | |
| <li><p>YMM/ZMM extend XMM — upper bits are <strong>not</strong> | |
| preserved</p></li> | |
| <li><p>K0–K7 (AVX‑512 masks): Caller‑saved</p></li> | |
| </ul> | |
| <hr /> | |
| <h3 id="calling-convention-essentials">2️⃣ Calling Convention | |
| Essentials</h3> | |
| <p><strong>Integer/pointer args</strong>:<br /> | |
| 1 → RDI<br /> | |
| 2 → RSI<br /> | |
| 3 → RDX<br /> | |
| 4 → RCX<br /> | |
| 5 → R8<br /> | |
| 6 → R9<br /> | |
| More → pushed on stack, right‑to‑left</p> | |
| <p><strong>Floating‑point args</strong>:<br /> | |
| 1 → XMM0<br /> | |
| 2 → XMM1<br /> | |
| … up to XMM7<br /> | |
| More → stack</p> | |
| <p><strong>Return values</strong>:</p> | |
| <ul> | |
| <li><p>Integer/pointer ≤64‑bit → RAX</p></li> | |
| <li><p>128‑bit integer → RDX:RAX</p></li> | |
| <li><p>Float/double → XMM0</p></li> | |
| <li><p>Complex float → XMM0 (real and imag packed); complex double → XMM0 (real), XMM1 (imag)</p></li> | |
| <li><p>Small structs ≤16 bytes → registers per classification | |
| rules</p></li> | |
| <li><p>Larger aggregates → hidden pointer in RDI</p></li> | |
| </ul> | |
| <p><strong>Variadic functions</strong>: AL holds an upper bound on the number of | |
| vector (XMM) registers used for arguments.</p> | |
| <hr /> | |
| <h3 id="stack-frame-and-alignment">3️⃣ Stack Frame and Alignment</h3> | |
| <ul> | |
| <li><p><strong>16‑byte stack alignment before call</strong></p></li> | |
| <li><p>Red zone: 128 bytes below RSP usable in leaf functions</p></li> | |
| <li><p>Typical frame layout:</p></li> | |
| </ul> | |
| <table style="width:35%;"> | |
| <colgroup> | |
| <col style="width: 34%" /> | |
| </colgroup> | |
| <tbody> | |
| <tr> | |
| <td>args > 6 (overflow)</td> | |
| </tr> | |
| <tr> | |
| <td>return address</td> | |
| </tr> | |
| <tr> | |
| <td>saved RBP</td> | |
| </tr> | |
| <tr> | |
| <td>callee-saved registers</td> | |
| </tr> | |
| <tr> | |
| <td>local variables</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <pre><code> RSP → low addresses</code></pre> | |
| <p>Example prologue/epilogue:</p> | |
| <div class="sourceCode" id="cb283"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb283-1"><a href="#cb283-1" aria-hidden="true" tabindex="-1"></a><span class="fu">func:</span></span> | |
| <span id="cb283-2"><a href="#cb283-2" aria-hidden="true" tabindex="-1"></a> <span class="bu">push</span> <span class="kw">rbp</span></span> | |
| <span id="cb283-3"><a href="#cb283-3" aria-hidden="true" tabindex="-1"></a> <span class="bu">mov</span> <span class="kw">rbp</span><span class="op">,</span> <span class="kw">rsp</span></span> | |
| <span id="cb283-4"><a href="#cb283-4" aria-hidden="true" tabindex="-1"></a> <span class="bu">push</span> <span class="kw">rbx</span> <span class="kw">r12</span> <span class="kw">r13</span> <span class="kw">r14</span> <span class="kw">r15</span></span> | |
| <span id="cb283-5"><a href="#cb283-5" aria-hidden="true" tabindex="-1"></a> <span class="bu">sub</span> <span class="kw">rsp</span><span class="op">,</span> <span class="dv">40</span> <span class="co">; locals (32) + 8 so RSP stays 16-byte aligned after six pushes</span></span> | |
| <span id="cb283-6"><a href="#cb283-6" aria-hidden="true" tabindex="-1"></a> <span class="co">; body</span></span> | |
| <span id="cb283-7"><a href="#cb283-7" aria-hidden="true" tabindex="-1"></a> <span class="bu">add</span> <span class="kw">rsp</span><span class="op">,</span> <span class="dv">40</span></span> | |
| <span id="cb283-8"><a href="#cb283-8" aria-hidden="true" tabindex="-1"></a> <span class="bu">pop</span> <span class="kw">r15</span> <span class="kw">r14</span> <span class="kw">r13</span> <span class="kw">r12</span> <span class="kw">rbx</span></span> | |
| <span id="cb283-9"><a href="#cb283-9" aria-hidden="true" tabindex="-1"></a> <span class="bu">pop</span> <span class="kw">rbp</span></span> | |
| <span id="cb283-10"><a href="#cb283-10" aria-hidden="true" tabindex="-1"></a> <span class="cf">ret</span></span></code></pre></div> | |
| <hr /> | |
| <h3 id="system-calls-linux-amd64">4️⃣ System Calls (Linux AMD64)</h3> | |
| <p>Registers:</p> | |
| <table> | |
| <thead> | |
| <tr> | |
| <th>Reg</th> | |
| <th>Purpose</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>RAX</td> | |
| <td>syscall number</td> | |
| </tr> | |
| <tr> | |
| <td>RDI</td> | |
| <td>arg1</td> | |
| </tr> | |
| <tr> | |
| <td>RSI</td> | |
| <td>arg2</td> | |
| </tr> | |
| <tr> | |
| <td>RDX</td> | |
| <td>arg3</td> | |
| </tr> | |
| <tr> | |
| <td>R10</td> | |
| <td>arg4</td> | |
| </tr> | |
| <tr> | |
| <td>R8</td> | |
| <td>arg5</td> | |
| </tr> | |
| <tr> | |
| <td>R9</td> | |
| <td>arg6</td> | |
| </tr> | |
| <tr> | |
| <td>RAX</td> | |
| <td>return value</td> | |
| </tr> | |
| <tr> | |
| <td>RCX,R11</td> | |
| <td>trashed</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p>Example:</p> | |
| <div class="sourceCode" id="cb284"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb284-1"><a href="#cb284-1" aria-hidden="true" tabindex="-1"></a><span class="co">; write(1, msg, len)</span></span> | |
| <span id="cb284-2"><a href="#cb284-2" aria-hidden="true" tabindex="-1"></a><span class="bu">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="dv">1</span> <span class="co">; SYS_write</span></span> | |
| <span id="cb284-3"><a href="#cb284-3" aria-hidden="true" tabindex="-1"></a><span class="bu">mov</span> <span class="kw">rdi</span><span class="op">,</span> <span class="dv">1</span> <span class="co">; fd = stdout</span></span> | |
| <span id="cb284-4"><a href="#cb284-4" aria-hidden="true" tabindex="-1"></a><span class="bu">lea</span> <span class="kw">rsi</span><span class="op">,</span> <span class="op">[</span>rel msg<span class="op">]</span></span> | |
| <span id="cb284-5"><a href="#cb284-5" aria-hidden="true" tabindex="-1"></a><span class="bu">mov</span> <span class="kw">rdx</span><span class="op">,</span> len</span> | |
| <span id="cb284-6"><a href="#cb284-6" aria-hidden="true" tabindex="-1"></a><span class="cf">syscall</span></span></code></pre></div> | |
| <hr /> | |
| <h3 id="data-alignment-rules">5️⃣ Data Alignment Rules</h3> | |
| <table> | |
| <thead> | |
| <tr> | |
| <th>Type</th> | |
| <th>Size</th> | |
| <th>Align</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td>char</td> | |
| <td>1</td> | |
| <td>1</td> | |
| </tr> | |
| <tr> | |
| <td>short</td> | |
| <td>2</td> | |
| <td>2</td> | |
| </tr> | |
| <tr> | |
| <td>int/float</td> | |
| <td>4</td> | |
| <td>4</td> | |
| </tr> | |
| <tr> | |
| <td>long/double</td> | |
| <td>8</td> | |
| <td>8</td> | |
| </tr> | |
| <tr> | |
| <td>long double</td> | |
| <td>16</td> | |
| <td>16</td> | |
| </tr> | |
| <tr> | |
| <td>__m128</td> | |
| <td>16</td> | |
| <td>16</td> | |
| </tr> | |
| <tr> | |
| <td>__m256</td> | |
| <td>32</td> | |
| <td>32</td> | |
| </tr> | |
| <tr> | |
| <td>__m512</td> | |
| <td>64</td> | |
| <td>64</td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <p>Structs padded automatically to meet member alignment.</p> | |
| <hr /> | |
| <h3 id="threadlocal-storage-tls">6️⃣ Thread‑Local Storage (TLS)</h3> | |
| <p>Access via the <strong>FS</strong> segment register (Linux user space; | |
| <strong>GS</strong> is reserved for the kernel):</p> | |
| <div class="sourceCode" id="cb285"><pre | |
| class="sourceCode asm"><code class="sourceCode fasm"><span id="cb285-1"><a href="#cb285-1" aria-hidden="true" tabindex="-1"></a><span class="bu">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="dt">qword</span> <span class="dt">ptr</span> <span class="kw">fs</span><span class="op">:[</span>var<span class="fu">@</span><span class="er">t</span>poff<span class="op">]</span></span></code></pre></div> | |
| <p>Dynamic TLS via <code>__tls_get_addr</code> helper for libraries.</p> | |
| <hr /> | |
| <h3 id="exceptionunwind-info">7️⃣ Exception/Unwind Info</h3> | |
| <p>DWARF <code>.cfi_*</code> directives record stack frame layout for | |
| debuggers/exception handlers.</p> | |
| <hr /> | |
| <p>This completes the condensed System V AMD64 ABI summary. The rules | |
| above are sufficient to write <strong>assembly templates</strong> for | |
| functions, syscalls, and struct returns that conform to this ABI.</p> | |
| <hr /> | |
| <h2 id="appendix-c-nasmgasmasm-syntax-comparison">Appendix C: | |
| NASM/GAS/MASM Syntax Comparison</h2> | |
| <h3 id="c.1-basic-syntax-differences">C.1 Basic Syntax Differences</h3> | |
| <h4 id="instruction-format">Instruction Format</h4> | |
| <table style="width:100%;"> | |
| <colgroup> | |
| <col style="width: 19%" /> | |
| <col style="width: 13%" /> | |
| <col style="width: 26%" /> | |
| <col style="width: 28%" /> | |
| <col style="width: 13%" /> | |
| </colgroup> | |
| <thead> | |
| <tr> | |
| <th>Feature</th> | |
| <th>NASM</th> | |
| <th>GAS (AT&T)</th> | |
| <th>GAS (Intel)</th> | |
| <th>MASM</th> | |
| </tr> | |
| </thead> | |
| <tbody> | |
| <tr> | |
| <td><strong>Operand Order</strong></td> | |
| <td><code>dest, src</code></td> | |
| <td><code>src, dest</code></td> | |
| <td><code>dest, src</code></td> | |
| <td><code>dest, src</code></td> | |
| </tr> | |
| <tr> | |
| <td><strong>Register Prefix</strong></td> | |
| <td>None</td> | |
| <td><code>%</code></td> | |
| <td>None</td> | |
| <td>None</td> | |
| </tr> | |
| <tr> | |
| <td><strong>Immediate Prefix</strong></td> | |
| <td>None</td> | |
| <td><code>$</code></td> | |
| <td>None</td> | |
| <td>None</td> | |
| </tr> | |
| <tr> | |
| <td><strong>Size Suffixes</strong></td> | |
| <td>Size keywords (<code>byte</code>/<code>word</code>/<code>dword</code>/<code>qword</code>)</td> | |
| <td><code>b/w/l/q</code> suffix</td> | |
| <td>Use PTR</td> | |
| <td>Use PTR</td> | |
| </tr> | |
| <tr> | |
| <td><strong>Comments</strong></td> | |
| <td><code>;</code></td> | |
| <td><code>#</code> or <code>/* */</code></td> | |
| <td><code>#</code> or <code>/* */</code></td> | |
| <td><code>;</code></td> | |
| </tr> | |
| <tr> | |
| <td><strong>Hex Numbers</strong></td> | |
| <td><code>0x123</code> or <code>123h</code></td> | |
| <td><code>$0x123</code></td> | |
| <td><code>0x123</code></td> | |
| <td><code>123h</code> or <code>0123h</code></td> | |
| </tr> | |
| <tr> | |
| <td><strong>Binary Numbers</strong></td> | |
| <td><code>0b1010</code> or <code>1010b</code></td> | |
| <td><code>0b1010</code></td> | |
| <td><code>0b1010</code></td> | |
| <td><code>1010b</code></td> | |
| </tr> | |
| </tbody> | |
| </table> | |
| <h4 id="basic-instruction-examples">Basic Instruction Examples</h4> | |
| <div class="sourceCode" id="cb286"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb286-1"><a href="#cb286-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb286-2"><a href="#cb286-2" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="kw">rbx</span> <span class="co">; move rbx to rax</span></span> | |
| <span id="cb286-3"><a href="#cb286-3" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="dv">123</span> <span class="co">; move immediate</span></span> | |
| <span id="cb286-4"><a href="#cb286-4" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">[</span><span class="kw">rbx</span><span class="op">]</span> <span class="co">; load from memory</span></span> | |
| <span id="cb286-5"><a href="#cb286-5" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="dt">byte</span> <span class="op">[</span><span class="kw">rax</span><span class="op">],</span> <span class="dv">5</span> <span class="co">; store byte</span></span> | |
| <span id="cb286-6"><a href="#cb286-6" aria-hidden="true" tabindex="-1"></a><span class="kw">add</span> <span class="kw">rax</span><span class="op">,</span> <span class="kw">rbx</span> <span class="co">; add registers</span></span> | |
| <span id="cb286-7"><a href="#cb286-7" aria-hidden="true" tabindex="-1"></a><span class="kw">lea</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">[</span><span class="kw">rbx</span><span class="op">+</span><span class="kw">rcx</span><span class="op">*</span><span class="dv">4</span><span class="op">+</span><span class="dv">10</span><span class="op">]</span> <span class="co">; load effective address</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS (AT&T syntax) | |
| movq %rbx, %rax # move rbx to rax | |
| movq $123, %rax # move immediate | |
| movq (%rbx), %rax # load from memory | |
| movb $5, (%rax) # store byte | |
| addq %rbx, %rax # add registers | |
| leaq 10(%rbx,%rcx,4), %rax # load effective address</code></pre> | |
| <pre class="gas"><code># GAS (Intel syntax) | |
| .intel_syntax noprefix | |
| mov rax, rbx # move rbx to rax | |
| mov rax, 123 # move immediate | |
| mov rax, [rbx] # load from memory | |
| mov byte ptr [rax], 5 # store byte | |
| add rax, rbx # add registers | |
| lea rax, [rbx+rcx*4+10] # load effective address</code></pre> | |
| <pre class="masm"><code>; MASM | |
| mov rax, rbx ; move rbx to rax | |
| mov rax, 123 ; move immediate | |
| mov rax, [rbx] ; load from memory | |
| mov byte ptr [rax], 5 ; store byte | |
| add rax, rbx ; add registers | |
| lea rax, [rbx+rcx*4+10] ; load effective address</code></pre> | |
| <h3 id="c.2-memory-addressing">C.2 Memory Addressing</h3> | |
| <h4 id="direct-memory-access">Direct Memory Access</h4> | |
| <div class="sourceCode" id="cb290"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb290-1"><a href="#cb290-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb290-2"><a href="#cb290-2" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">[</span><span class="bn">0x401000</span><span class="op">]</span> <span class="co">; absolute address</span></span> | |
| <span id="cb290-3"><a href="#cb290-3" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">[</span>myvar<span class="op">]</span> <span class="co">; labeled address</span></span> | |
| <span id="cb290-4"><a href="#cb290-4" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">[</span><span class="kw">rel</span> myvar<span class="op">]</span> <span class="co">; RIP-relative (explicit)</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS (AT&T) | |
| movq 0x401000, %rax # absolute address | |
| movq myvar, %rax # labeled address | |
| movq myvar(%rip), %rax # RIP-relative</code></pre> | |
| <pre class="gas"><code># GAS (Intel) | |
| mov rax, qword ptr [0x401000] # absolute | |
| mov rax, qword ptr [myvar] # labeled | |
| mov rax, qword ptr [rip+myvar] # RIP-relative</code></pre> | |
| <pre class="masm"><code>; MASM | |
| mov rax, qword ptr [401000h] ; absolute | |
| mov rax, qword ptr [myvar] ; labeled | |
| mov rax, qword ptr myvar ; also valid</code></pre> | |
| <h4 id="complex-addressing-modes-1">Complex Addressing Modes</h4> | |
| <div class="sourceCode" id="cb294"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb294-1"><a href="#cb294-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM - [base + index*scale + disp]</span></span> | |
| <span id="cb294-2"><a href="#cb294-2" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">[</span><span class="kw">rbx</span> <span class="op">+</span> <span class="kw">rcx</span><span class="op">*</span><span class="dv">8</span> <span class="op">+</span> <span class="dv">16</span><span class="op">]</span></span> | |
| <span id="cb294-3"><a href="#cb294-3" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">[</span><span class="kw">rbx</span> <span class="op">+</span> <span class="dv">4</span><span class="op">*</span><span class="kw">rcx</span> <span class="op">-</span> <span class="dv">32</span><span class="op">]</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS (AT&T) - disp(base, index, scale) | |
| movq 16(%rbx, %rcx, 8), %rax | |
| movq -32(%rbx, %rcx, 4), %rax</code></pre> | |
| <pre class="gas"><code># GAS (Intel) | |
| mov rax, [rbx + rcx*8 + 16] | |
| mov rax, [rbx + rcx*4 - 32]</code></pre> | |
| <pre class="masm"><code>; MASM | |
| mov rax, [rbx + rcx*8 + 16] | |
| mov rax, [rbx + rcx*4 - 32]</code></pre> | |
| <h3 id="c.3-data-definitions">C.3 Data Definitions</h3> | |
| <h4 id="basic-data-types">Basic Data Types</h4> | |
| <div class="sourceCode" id="cb298"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb298-1"><a href="#cb298-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb298-2"><a href="#cb298-2" aria-hidden="true" tabindex="-1"></a><span class="kw">section</span> <span class="fu">.data</span></span> | |
| <span id="cb298-3"><a href="#cb298-3" aria-hidden="true" tabindex="-1"></a> byte_val <span class="dt">db</span> <span class="bn">0x12</span> <span class="co">; 1 byte</span></span> | |
| <span id="cb298-4"><a href="#cb298-4" aria-hidden="true" tabindex="-1"></a> word_val <span class="dt">dw</span> <span class="bn">0x1234</span> <span class="co">; 2 bytes</span></span> | |
| <span id="cb298-5"><a href="#cb298-5" aria-hidden="true" tabindex="-1"></a> dword_val <span class="dt">dd</span> <span class="bn">0x12345678</span> <span class="co">; 4 bytes</span></span> | |
| <span id="cb298-6"><a href="#cb298-6" aria-hidden="true" tabindex="-1"></a> qword_val <span class="dt">dq</span> <span class="bn">0x123456789ABCDEF</span> <span class="co">; 8 bytes</span></span> | |
| <span id="cb298-7"><a href="#cb298-7" aria-hidden="true" tabindex="-1"></a> float_val <span class="dt">dd</span> <span class="fl">3.14</span> <span class="co">; 32-bit float</span></span> | |
| <span id="cb298-8"><a href="#cb298-8" aria-hidden="true" tabindex="-1"></a> double_val <span class="dt">dq</span> <span class="fl">3.14159</span> <span class="co">; 64-bit double</span></span> | |
| <span id="cb298-9"><a href="#cb298-9" aria-hidden="true" tabindex="-1"></a> string_val <span class="dt">db</span> <span class="st">"Hello"</span><span class="op">,</span> <span class="dv">0</span> <span class="co">; null-terminated</span></span> | |
| <span id="cb298-10"><a href="#cb298-10" aria-hidden="true" tabindex="-1"></a> array_val <span class="dt">dd</span> <span class="dv">1</span><span class="op">,</span> <span class="dv">2</span><span class="op">,</span> <span class="dv">3</span><span class="op">,</span> <span class="dv">4</span><span class="op">,</span> <span class="dv">5</span> <span class="co">; array</span></span> | |
| <span id="cb298-11"><a href="#cb298-11" aria-hidden="true" tabindex="-1"></a> buffer <span class="dt">resb</span> <span class="dv">256</span> <span class="co">; reserves 256 bytes (belongs in section .bss; zero-filled with a warning in .data)</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS | |
| .section .data | |
| byte_val: .byte 0x12 | |
| word_val: .word 0x1234 | |
| dword_val: .long 0x12345678 | |
| qword_val: .quad 0x123456789ABCDEF | |
| float_val: .float 3.14 | |
| double_val: .double 3.14159 | |
| string_val: .asciz "Hello" # null-terminated | |
| string2: .ascii "World" # not null-terminated | |
| array_val: .long 1, 2, 3, 4, 5 | |
| .section .bss | |
| buffer: .skip 256 # uninitialized</code></pre> | |
| <pre class="masm"><code>; MASM | |
| .data | |
| byte_val BYTE 12h | |
| word_val WORD 1234h | |
| dword_val DWORD 12345678h | |
| qword_val QWORD 123456789ABCDEFh | |
| float_val REAL4 3.14 | |
| double_val REAL8 3.14159 | |
| string_val BYTE "Hello", 0 | |
| array_val DWORD 1, 2, 3, 4, 5 | |
| buffer BYTE 256 DUP(?) ; uninitialized</code></pre> | |
| <h4 id="string-definitions">String Definitions</h4> | |
| <div class="sourceCode" id="cb301"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb301-1"><a href="#cb301-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb301-2"><a href="#cb301-2" aria-hidden="true" tabindex="-1"></a>str1 <span class="dt">db</span> <span class="st">'Hello World'</span><span class="op">,</span> <span class="bn">0x0A</span><span class="op">,</span> <span class="dv">0</span> <span class="co">; with newline</span></span> | |
| <span id="cb301-3"><a href="#cb301-3" aria-hidden="true" tabindex="-1"></a>str2 <span class="dt">db</span> <span class="st">`Hello</span><span class="ch">\n</span><span class="st">World</span><span class="ch">\0</span><span class="st">`</span> <span class="co">; C-style escapes</span></span> | |
| <span id="cb301-4"><a href="#cb301-4" aria-hidden="true" tabindex="-1"></a>str3 <span class="dt">times</span> <span class="dv">64</span> <span class="dt">db</span> <span class="dv">0</span> <span class="co">; 64 zeros</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS | |
| str1: .asciz "Hello World\n" # null-terminated | |
| str2: .string "Hello World" # same as .asciz | |
| str3: .fill 64, 1, 0 # 64 bytes of 0</code></pre> | |
| <pre class="masm"><code>; MASM | |
| str1 BYTE "Hello World", 0Ah, 0 | |
| str2 BYTE "Hello", 0Dh, 0Ah, "World", 0 | |
| str3 BYTE 64 DUP(0)</code></pre> | |
| <h3 id="c.4-sections-and-segments">C.4 Sections and Segments</h3> | |
| <div class="sourceCode" id="cb304"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb304-1"><a href="#cb304-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb304-2"><a href="#cb304-2" aria-hidden="true" tabindex="-1"></a><span class="kw">section</span> <span class="fu">.text</span></span> | |
| <span id="cb304-3"><a href="#cb304-3" aria-hidden="true" tabindex="-1"></a> <span class="kw">global</span> _start</span> | |
| <span id="cb304-4"><a href="#cb304-4" aria-hidden="true" tabindex="-1"></a><span class="fu">_start:</span></span> | |
| <span id="cb304-5"><a href="#cb304-5" aria-hidden="true" tabindex="-1"></a> <span class="co">; code here</span></span> | |
| <span id="cb304-6"><a href="#cb304-6" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb304-7"><a href="#cb304-7" aria-hidden="true" tabindex="-1"></a><span class="kw">section</span> <span class="fu">.data</span></span> | |
| <span id="cb304-8"><a href="#cb304-8" aria-hidden="true" tabindex="-1"></a> <span class="co">; initialized data</span></span> | |
| <span id="cb304-9"><a href="#cb304-9" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb304-10"><a href="#cb304-10" aria-hidden="true" tabindex="-1"></a><span class="kw">section</span> <span class="fu">.bss</span></span> | |
| <span id="cb304-11"><a href="#cb304-11" aria-hidden="true" tabindex="-1"></a> <span class="co">; uninitialized data</span></span> | |
| <span id="cb304-12"><a href="#cb304-12" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb304-13"><a href="#cb304-13" aria-hidden="true" tabindex="-1"></a><span class="kw">section</span> <span class="fu">.rodata</span></span> | |
| <span id="cb304-14"><a href="#cb304-14" aria-hidden="true" tabindex="-1"></a> <span class="co">; read-only data</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS | |
| .section .text | |
| .global _start | |
| _start: | |
| # code here | |
| .section .data | |
| # initialized data | |
| .section .bss | |
| # uninitialized data | |
| .section .rodata | |
| # read-only data</code></pre> | |
| <pre class="masm"><code>; MASM (Windows) | |
| .code | |
| main PROC | |
| ; code here | |
| main ENDP | |
| .data | |
| ; initialized data | |
| .data? | |
| ; uninitialized data | |
| .const | |
| ; read-only data</code></pre> | |
| <h3 id="c.5-macros-and-directives">C.5 Macros and Directives</h3> | |
| <h4 id="macro-definitions">Macro Definitions</h4> | |
| <div class="sourceCode" id="cb307"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb307-1"><a href="#cb307-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb307-2"><a href="#cb307-2" aria-hidden="true" tabindex="-1"></a><span class="ot">%macro</span> pushall <span class="dv">0</span></span> | |
| <span id="cb307-3"><a href="#cb307-3" aria-hidden="true" tabindex="-1"></a> <span class="kw">push</span> <span class="kw">rax</span></span> | |
| <span id="cb307-4"><a href="#cb307-4" aria-hidden="true" tabindex="-1"></a> <span class="kw">push</span> <span class="kw">rbx</span></span> | |
| <span id="cb307-5"><a href="#cb307-5" aria-hidden="true" tabindex="-1"></a> <span class="kw">push</span> <span class="kw">rcx</span></span> | |
| <span id="cb307-6"><a href="#cb307-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">push</span> <span class="kw">rdx</span></span> | |
| <span id="cb307-7"><a href="#cb307-7" aria-hidden="true" tabindex="-1"></a><span class="ot">%endmacro</span></span> | |
| <span id="cb307-8"><a href="#cb307-8" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb307-9"><a href="#cb307-9" aria-hidden="true" tabindex="-1"></a><span class="ot">%macro</span> add3 <span class="dv">3</span> <span class="co">; 3 parameters</span></span> | |
| <span id="cb307-10"><a href="#cb307-10" aria-hidden="true" tabindex="-1"></a> <span class="kw">mov</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">%</span><span class="dv">1</span></span> | |
| <span id="cb307-11"><a href="#cb307-11" aria-hidden="true" tabindex="-1"></a> <span class="kw">add</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">%</span><span class="dv">2</span></span> | |
| <span id="cb307-12"><a href="#cb307-12" aria-hidden="true" tabindex="-1"></a> <span class="kw">add</span> <span class="kw">rax</span><span class="op">,</span> <span class="op">%</span><span class="dv">3</span></span> | |
| <span id="cb307-13"><a href="#cb307-13" aria-hidden="true" tabindex="-1"></a><span class="ot">%endmacro</span></span> | |
| <span id="cb307-14"><a href="#cb307-14" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb307-15"><a href="#cb307-15" aria-hidden="true" tabindex="-1"></a><span class="co">; Usage</span></span> | |
| <span id="cb307-16"><a href="#cb307-16" aria-hidden="true" tabindex="-1"></a>pushall</span> | |
| <span id="cb307-17"><a href="#cb307-17" aria-hidden="true" tabindex="-1"></a>add3 <span class="kw">rdi</span><span class="op">,</span> <span class="kw">rsi</span><span class="op">,</span> <span class="kw">rdx</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS | |
| .macro pushall | |
| push %rax | |
| push %rbx | |
| push %rcx | |
| push %rdx | |
| .endm | |
| .macro add3 p1, p2, p3 | |
| movq \p1, %rax | |
| addq \p2, %rax | |
| addq \p3, %rax | |
| .endm | |
| # Usage | |
| pushall | |
| add3 %rdi, %rsi, %rdx</code></pre> | |
| <pre class="masm"><code>; MASM | |
| pushall MACRO | |
| push rax | |
| push rbx | |
| push rcx | |
| push rdx | |
| ENDM | |
| add3 MACRO p1, p2, p3 | |
| mov rax, p1 | |
| add rax, p2 | |
| add rax, p3 | |
| ENDM | |
| ; Usage | |
| pushall | |
| add3 rdi, rsi, rdx</code></pre> | |
| <h4 id="conditional-assembly">Conditional Assembly</h4> | |
| <div class="sourceCode" id="cb310"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb310-1"><a href="#cb310-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb310-2"><a href="#cb310-2" aria-hidden="true" tabindex="-1"></a><span class="ot">%ifdef</span> DEBUG</span> | |
| <span id="cb310-3"><a href="#cb310-3" aria-hidden="true" tabindex="-1"></a> <span class="kw">mov</span> <span class="kw">rdi</span><span class="op">,</span> debug_msg</span> | |
| <span id="cb310-4"><a href="#cb310-4" aria-hidden="true" tabindex="-1"></a> <span class="cf">call</span> print_debug</span> | |
| <span id="cb310-5"><a href="#cb310-5" aria-hidden="true" tabindex="-1"></a><span class="ot">%endif</span></span> | |
| <span id="cb310-6"><a href="#cb310-6" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb310-7"><a href="#cb310-7" aria-hidden="true" tabindex="-1"></a><span class="ot">%if</span> BUFFER_SIZE <span class="op">></span> <span class="dv">1024</span></span> | |
| <span id="cb310-8"><a href="#cb310-8" aria-hidden="true" tabindex="-1"></a> <span class="ot">%error</span> "Buffer too large<span class="st">"</span></span> | |
| <span id="cb310-9"><a href="#cb310-9" aria-hidden="true" tabindex="-1"></a><span class="ot">%elif</span> BUFFER_SIZE <span class="op">&lt;</span> <span class="dv">16</span></span> | |
| <span id="cb310-10"><a href="#cb310-10" aria-hidden="true" tabindex="-1"></a> <span class="ot">%error</span> "Buffer too small<span class="st">"</span></span> | |
| <span id="cb310-11"><a href="#cb310-11" aria-hidden="true" tabindex="-1"></a><span class="ot">%endif</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS | |
| .ifdef DEBUG | |
| movq $debug_msg, %rdi | |
| call print_debug | |
| .endif | |
| .if BUFFER_SIZE > 1024 | |
| .error "Buffer too large" | |
| .elseif BUFFER_SIZE &lt; 16 | |
| .error "Buffer too small" | |
| .endif</code></pre> | |
| <pre class="masm"><code>; MASM | |
| IFDEF DEBUG | |
| mov rdi, OFFSET debug_msg | |
| call print_debug | |
| ENDIF | |
| IF BUFFER_SIZE GT 1024 | |
| .ERR &lt;Buffer too large&gt; | |
| ELSEIF BUFFER_SIZE LT 16 | |
| .ERR &lt;Buffer too small&gt; | |
| ENDIF</code></pre> | |
| <h3 id="c.6-symbols-and-labels">C.6 Symbols and Labels</h3> | |
| <h4 id="global-and-external-symbols">Global and External Symbols</h4> | |
| <div class="sourceCode" id="cb313"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb313-1"><a href="#cb313-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb313-2"><a href="#cb313-2" aria-hidden="true" tabindex="-1"></a><span class="kw">global</span> main <span class="co">; export symbol</span></span> | |
| <span id="cb313-3"><a href="#cb313-3" aria-hidden="true" tabindex="-1"></a><span class="kw">extern</span> printf <span class="co">; import symbol</span></span> | |
| <span id="cb313-4"><a href="#cb313-4" aria-hidden="true" tabindex="-1"></a><span class="kw">extern</span> data_var</span> | |
| <span id="cb313-5"><a href="#cb313-5" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb313-6"><a href="#cb313-6" aria-hidden="true" tabindex="-1"></a><span class="fu">main:</span></span> | |
| <span id="cb313-7"><a href="#cb313-7" aria-hidden="true" tabindex="-1"></a> <span class="co">; function code</span></span> | |
| <span id="cb313-8"><a href="#cb313-8" aria-hidden="true" tabindex="-1"></a> <span class="cf">ret</span></span> | |
| <span id="cb313-9"><a href="#cb313-9" aria-hidden="true" tabindex="-1"></a></span> | |
| <span id="cb313-10"><a href="#cb313-10" aria-hidden="true" tabindex="-1"></a><span class="fu">.loop_label:</span> <span class="co">; local label</span></span> | |
| <span id="cb313-11"><a href="#cb313-11" aria-hidden="true" tabindex="-1"></a> <span class="co">; loop code</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS | |
| .global main # export symbol | |
| .extern printf # import (optional) | |
| main: | |
| # function code | |
| ret | |
| .Lloop_label: # local label | |
| # loop code</code></pre> | |
| <pre class="masm"><code>; MASM | |
| PUBLIC main ; export symbol | |
| EXTERN printf:PROC ; import function | |
| EXTERN data_var:QWORD ; import variable | |
| main PROC | |
| ; function code | |
| ret | |
| main ENDP | |
| @@: ; anonymous label | |
| ; loop code</code></pre> | |
| <h4 id="alignment-directives">Alignment Directives</h4> | |
| <div class="sourceCode" id="cb316"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb316-1"><a href="#cb316-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb316-2"><a href="#cb316-2" aria-hidden="true" tabindex="-1"></a><span class="kw">align</span> <span class="dv">16</span> <span class="co">; align to 16 bytes</span></span> | |
| <span id="cb316-3"><a href="#cb316-3" aria-hidden="true" tabindex="-1"></a><span class="kw">alignb</span> <span class="dv">16</span> <span class="co">; align reserved space (pads with resb, for .bss)</span></span> | |
| <span id="cb316-4"><a href="#cb316-4" aria-hidden="true" tabindex="-1"></a><span class="dt">times</span> <span class="op">(</span><span class="dv">16</span><span class="op">-($-</span>$$<span class="op">)</span> <span class="op">%</span> <span class="dv">16</span><span class="op">)</span> <span class="op">%</span> <span class="dv">16</span> nop <span class="co">; manual alignment (emits nothing if already aligned)</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS | |
| .align 16 # align to 2^16 on some platforms! | |
| .p2align 4 # align to 2^4 = 16 bytes (portable) | |
| .balign 16 # byte align to 16</code></pre> | |
| <pre class="masm"><code>; MASM | |
| ALIGN 16 ; align to 16 bytes</code></pre> | |
| <h3 id="c.7-procedure-definitions">C.7 Procedure Definitions</h3> | |
| <h4 id="function-declaration">Function Declaration</h4> | |
| <div class="sourceCode" id="cb319"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb319-1"><a href="#cb319-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb319-2"><a href="#cb319-2" aria-hidden="true" tabindex="-1"></a><span class="kw">section</span> <span class="fu">.text</span></span> | |
| <span id="cb319-3"><a href="#cb319-3" aria-hidden="true" tabindex="-1"></a><span class="kw">global</span> my_function</span> | |
| <span id="cb319-4"><a href="#cb319-4" aria-hidden="true" tabindex="-1"></a><span class="fu">my_function:</span></span> | |
| <span id="cb319-5"><a href="#cb319-5" aria-hidden="true" tabindex="-1"></a> <span class="kw">push</span> <span class="kw">rbp</span></span> | |
| <span id="cb319-6"><a href="#cb319-6" aria-hidden="true" tabindex="-1"></a> <span class="kw">mov</span> <span class="kw">rbp</span><span class="op">,</span> <span class="kw">rsp</span></span> | |
| <span id="cb319-7"><a href="#cb319-7" aria-hidden="true" tabindex="-1"></a> <span class="co">; function body</span></span> | |
| <span id="cb319-8"><a href="#cb319-8" aria-hidden="true" tabindex="-1"></a> <span class="kw">pop</span> <span class="kw">rbp</span></span> | |
| <span id="cb319-9"><a href="#cb319-9" aria-hidden="true" tabindex="-1"></a> <span class="cf">ret</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS (AT&T) | |
| .section .text | |
| .global my_function | |
| .type my_function, @function | |
| my_function: | |
| pushq %rbp | |
| movq %rsp, %rbp | |
| # function body | |
| popq %rbp | |
| ret | |
| .size my_function, .-my_function</code></pre> | |
| <pre class="gas"><code># GAS (Intel) | |
| .intel_syntax noprefix | |
| .global my_function | |
| .type my_function, @function | |
| my_function: | |
| push rbp | |
| mov rbp, rsp | |
| # function body | |
| pop rbp | |
| ret</code></pre> | |
| <pre class="masm"><code>; MASM | |
| .code | |
| my_function PROC | |
| push rbp | |
| mov rbp, rsp | |
| ; function body | |
| pop rbp | |
| ret | |
| my_function ENDP</code></pre> | |
| <h3 id="c.8-simd-instructions">C.8 SIMD Instructions</h3> | |
| <h4 id="sseavx-instructions">SSE/AVX Instructions</h4> | |
| <div class="sourceCode" id="cb323"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb323-1"><a href="#cb323-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb323-2"><a href="#cb323-2" aria-hidden="true" tabindex="-1"></a><span class="kw">movaps</span> <span class="kw">xmm0</span><span class="op">,</span> <span class="op">[</span><span class="kw">rsi</span><span class="op">]</span> <span class="co">; aligned move</span></span> | |
| <span id="cb323-3"><a href="#cb323-3" aria-hidden="true" tabindex="-1"></a><span class="kw">movups</span> <span class="kw">xmm1</span><span class="op">,</span> <span class="op">[</span><span class="kw">rdi</span><span class="op">]</span> <span class="co">; unaligned move</span></span> | |
| <span id="cb323-4"><a href="#cb323-4" aria-hidden="true" tabindex="-1"></a><span class="kw">addps</span> <span class="kw">xmm0</span><span class="op">,</span> <span class="kw">xmm1</span> <span class="co">; packed single add</span></span> | |
| <span id="cb323-5"><a href="#cb323-5" aria-hidden="true" tabindex="-1"></a><span class="kw">vmovaps</span> <span class="kw">ymm0</span><span class="op">,</span> <span class="op">[</span><span class="kw">rsi</span><span class="op">]</span> <span class="co">; AVX 256-bit</span></span> | |
| <span id="cb323-6"><a href="#cb323-6" aria-hidden="true" tabindex="-1"></a><span class="kw">vaddps</span> <span class="kw">ymm0</span><span class="op">,</span> <span class="kw">ymm1</span><span class="op">,</span> <span class="kw">ymm2</span> <span class="co">; AVX 3-operand</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS (AT&T) | |
| movaps (%rsi), %xmm0 # aligned move | |
| movups (%rdi), %xmm1 # unaligned move | |
| addps %xmm1, %xmm0 # packed single add | |
| vmovaps (%rsi), %ymm0 # AVX 256-bit | |
| vaddps %ymm2, %ymm1, %ymm0 # AVX 3-operand (src2, src1, dest)</code></pre> | |
| <pre class="gas"><code># GAS (Intel) | |
| movaps xmm0, [rsi] | |
| movups xmm1, [rdi] | |
| addps xmm0, xmm1 | |
| vmovaps ymm0, [rsi] | |
| vaddps ymm0, ymm1, ymm2</code></pre> | |
| <pre class="masm"><code>; MASM | |
| movaps xmm0, xmmword ptr [rsi] | |
| movups xmm1, xmmword ptr [rdi] | |
| addps xmm0, xmm1 | |
| vmovaps ymm0, ymmword ptr [rsi] | |
| vaddps ymm0, ymm1, ymm2</code></pre> | |
| <h4 id="avx-512-with-masking">AVX-512 with Masking</h4> | |
| <div class="sourceCode" id="cb327"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb327-1"><a href="#cb327-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb327-2"><a href="#cb327-2" aria-hidden="true" tabindex="-1"></a><span class="kw">vmovaps</span> <span class="kw">zmm0</span> <span class="op">{</span>k1<span class="op">},</span> <span class="op">[</span><span class="kw">rsi</span><span class="op">]</span> <span class="co">; masked load</span></span> | |
| <span id="cb327-3"><a href="#cb327-3" aria-hidden="true" tabindex="-1"></a><span class="kw">vaddps</span> <span class="kw">zmm0</span> <span class="op">{</span>k1<span class="op">}{</span>z<span class="op">},</span> <span class="kw">zmm1</span><span class="op">,</span> <span class="kw">zmm2</span> <span class="co">; masked add with zeroing</span></span></code></pre></div> | |
| <pre class="gas"><code># GAS (AT&T) | |
| vmovaps (%rsi), %zmm0{%k1} | |
| vaddps %zmm2, %zmm1, %zmm0{%k1}{z}</code></pre> | |
| <pre class="gas"><code># GAS (Intel) | |
| vmovaps zmm0{k1}, [rsi] | |
| vaddps zmm0{k1}{z}, zmm1, zmm2</code></pre> | |
| <pre class="masm"><code>; MASM | |
| vmovaps zmm0{k1}, zmmword ptr [rsi] | |
| vaddps zmm0{k1}{z}, zmm1, zmm2</code></pre> | |
| <h3 id="c.9-system-instructions">C.9 System Instructions</h3> | |
| <h4 id="privileged-instructions">Privileged Instructions</h4> | |
| <div class="sourceCode" id="cb331"><pre | |
| class="sourceCode nasm"><code class="sourceCode nasm"><span id="cb331-1"><a href="#cb331-1" aria-hidden="true" tabindex="-1"></a><span class="co">; NASM</span></span> | |
| <span id="cb331-2"><a href="#cb331-2" aria-hidden="true" tabindex="-1"></a><span class="kw">lgdt</span> <span class="op">[</span>gdtr<span class="op">]</span> <span class="co">; load GDT</span></span> | |
| <span id="cb331-3"><a href="#cb331-3" aria-hidden="true" tabindex="-1"></a><span class="kw">lidt</span> <span class="op">[</span>idtr<span class="op">]</span> <span class="co">; load IDT</span></span> | |
| <span id="cb331-4"><a href="#cb331-4" aria-hidden="true" tabindex="-1"></a><span class="kw">mov</span> <span class="kw">cr3</span><span class="op">,</span> <span class="kw">rax</span> <span class="co">; load page-table base address</span></span></code></pre></div> | |
| <p>From what I see, you’ve broken it down into logically separated | |
| sub‑sections:</p> | |
| <ul> | |
| <li><p><strong>Basic Syntax Differences</strong> — covers operand order, | |
| register/immediate prefixes, size suffixes and comment/number | |
| formats.</p></li> | |
| <li><p><strong>Memory Addressing</strong> — examples for absolute, label | |
| and RIP‑relative addressing, plus | |
| base+index*scale+displacement.</p></li> | |
| <li><p><strong>Data Definitions</strong> — maps NASM | |
| <code>db/dw</code>‑style to GAS <code>.byte/.word</code> and MASM | |
| <code>BYTE/WORD</code> formats, including strings, arrays and | |
| uninitialised buffers.</p></li> | |
| <li><p><strong>Sections/Segments</strong> — shows how the three syntaxes | |
| declare <code>.text</code>, <code>.data</code>, <code>.bss</code> and | |
| platform‑specific segments.</p></li> | |
| <li><p><strong>Macros/Directives</strong> — explicit macro examples for | |
| each assembler’s definition style, with conditional assembly | |
| patterns.</p></li> | |
| <li><p><strong>Symbols/Labels</strong> — global/external symbol | |
| exports/imports and alignment directives.</p></li> | |
| <li><p><strong>Procedure Definitions</strong> — function | |
| prologues/epilogues and declaration differences.</p></li> | |
| <li><p><strong>SIMD Instructions</strong> — SSE, AVX, AVX‑512 usage, | |
| operand order and how masking/zeroing is written in each | |
| dialect.</p></li> | |
| <li><p><strong>System Instructions</strong> — privileged op | |
| usage.</p></li> | |
| </ul> | |
| <p>This lines up completely with your “Appendix C: NASM/GAS/MASM Syntax | |
| Comparison” bullet in the earlier summary — the table of operand | |
| order/register/immediate formatting, addressing mode differences, | |
| directives, and SIMD masking syntax match exactly.</p> | |
| <p>If what you need is a consolidated, final deliverable, I can merge | |
| this detailed appendix with the Chapter 14–16 and Appendix A–B summaries | |
| you’ve already compiled into a single, continuous reference document. | |
| That way you’d have:</p> | |
| <ol type="1"> | |
| <li><p>High‑level narrative summaries for the chapter content.</p></li> | |
| <li><p>Quick‑reference tables from Appendix A and ABI notes from | |
| Appendix B.</p></li> | |
| <li><p>This full syntax comparison section as Appendix C.</p></li> | |
| </ol> | |
| <p>Do you want me to generate that combined reference so it’s ready to | |
| use as one cohesive manual?</p> | |
| <ul> | |
| <li>–</li> | |
| </ul> | |
| </main> | |
| </body> | |
| </html> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment