Skip to content

Instantly share code, notes, and snippets.

@kunalspathak
Last active June 5, 2025 07:11
Show Gist options
  • Select an option

  • Save kunalspathak/b7c1253d32b0b5353a3e059ce46bf68f to your computer and use it in GitHub Desktop.

Select an option

Save kunalspathak/b7c1253d32b0b5353a3e059ce46bf68f to your computer and use it in GitHub Desktop.
NEON vs. SVE for VL=16B

Diffs are based on 15,914 contexts (4 MinOpts, 15,910 FullOpts).

Base JIT options: UseSveForVectorT=0

Diff JIT options: UseSveForVectorT=1

Overall (+768 bytes)
| Collection | Base size (bytes) | Diff size (bytes) | PerfScore in Diffs |
|---|---|---|---|
| benchmarks.run.windows.arm64.checked.mch | 7,485,508 | +768 | +18.88% |
FullOpts (+768 bytes)
| Collection | Base size (bytes) | Diff size (bytes) | PerfScore in Diffs |
|---|---|---|---|
| benchmarks.run.windows.arm64.checked.mch | 7,485,104 | +768 | +18.88% |
Example diffs
benchmarks.run.windows.arm64.checked.mch
-28 (-24.14%) : 14589.dasm - System.Numerics.Tests.Perf_VectorOf`1[ulong]:DotBenchmark():ulong:this (FullOpts)
@@ -10,8 +10,8 @@
 ;
 ;* V00 this         [V00    ] (  0,  0   )     ref  ->  zero-ref    this class-hnd single-def <System.Numerics.Tests.Perf_VectorOf`1[ulong]>
 ;# V01 OutArgs      [V01    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
-;  V02 tmp1         [V02,T02] (  3,  6   )  simd16  ->  d16         "Inlining Arg" <System.Numerics.Vector`1[ulong]>
-;  V03 tmp2         [V03,T03] (  3,  6   )  simd16  ->  d17         "Inlining Arg" <System.Numerics.Vector`1[ulong]>
+;  V02 tmp1         [V02,T02] (  2,  4   )  simd16  ->  d16         "Inlining Arg" <System.Numerics.Vector`1[ulong]>
+;  V03 tmp2         [V03,T03] (  2,  4   )  simd16  ->  d17         "Inlining Arg" <System.Numerics.Vector`1[ulong]>
 ;* V04 cse0         [V04,T01] (  0,  0   )   byref  ->  zero-ref    "CSE #02: aggressive"
 ;  V05 cse1         [V05,T00] (  3,  3   )    long  ->   x0         "CSE #03: aggressive"
 ;
@@ -34,17 +34,10 @@ G_M2272_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
             movk    x0, #0xD1FFAB1E LSL #32
             ldr     q16, [x0]
             ldr     q17, [x0, #0x20]
+            mul     z16.d, z16.d, z17.d
+            addp    d16, v16.2d
             umov    x0, v16.d[0]
-            umov    x1, v17.d[0]
-            mul     x0, x0, x1
-            ins     v18.d[0], x0
-            umov    x0, v16.d[1]
-            umov    x1, v17.d[1]
-            mul     x0, x0, x1
-            ins     v18.d[1], x0
-            addp    d16, v18.2d
-            umov    x0, v16.d[0]
-						;; size=60 bbWeight=1 PerfScore 20.50
+						;; size=32 bbWeight=1 PerfScore 14.50
 G_M2272_IG04:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
@@ -57,7 +50,7 @@ G_M2272_IG05:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}
             b       G_M2272_IG03
 						;; size=20 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 116, prolog size 8, PerfScore 29.50, instruction count 29, allocated bytes for code 116 (MethodHash=7fa2f71f) for method System.Numerics.Tests.Perf_VectorOf`1[ulong]:DotBenchmark():ulong:this (FullOpts)
+; Total bytes of code 88, prolog size 8, PerfScore 23.50, instruction count 22, allocated bytes for code 88 (MethodHash=7fa2f71f) for method System.Numerics.Tests.Perf_VectorOf`1[ulong]:DotBenchmark():ulong:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -68,7 +61,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 29 (0x0001d) Actual length = 116 (0x000074)
+  Function Length   : 22 (0x00016) Actual length = 88 (0x000058)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-4 (-3.85%) : 4471.dasm - System.Numerics.Tests.Perf_VectorOf`1[long]:LessThanAllBenchmark():ubyte:this (FullOpts)
@@ -11,7 +11,6 @@
 ;# V01 OutArgs      [V01    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;* V02 cse0         [V02,T01] (  0,  0   )   byref  ->  zero-ref    "CSE #02: aggressive"
 ;  V03 cse1         [V03,T00] (  3,  3   )    long  ->   x0         "CSE #03: aggressive"
-;  V04 rat0         [V04,T02] (  3,  6   )  simd16  ->  d16         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
@@ -32,14 +31,13 @@ G_M64305_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
             movk    x0, #0xD1FFAB1E LSL #32
             ldr     q16, [x0]
             ldr     q17, [x0, #0x20]
-            cmgt    v16.2d, v17.2d, v16.2d
-            mvni    v17.4s, #0
-            cmeq    v16.2d, v16.2d, v17.2d
-            uminp   v16.4s, v16.4s, v16.4s
-            umov    x0, v16.d[0]
-            cmn     x0, #1
+            ptrue   p0.d
+            cmpgt   p0.d, p0/z, z17.d, z16.d
+            ptrue   p1.d
+            cntp    x0, p1, p0.d
+            cmp     x0, #2
             cset    x0, eq
-						;; size=48 bbWeight=1 PerfScore 13.00
+						;; size=44 bbWeight=1 PerfScore 17.50
 G_M64305_IG04:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
@@ -52,7 +50,7 @@ G_M64305_IG05:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
             b       G_M64305_IG03
 						;; size=20 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 104, prolog size 8, PerfScore 22.00, instruction count 26, allocated bytes for code 104 (MethodHash=d6c204ce) for method System.Numerics.Tests.Perf_VectorOf`1[long]:LessThanAllBenchmark():ubyte:this (FullOpts)
+; Total bytes of code 100, prolog size 8, PerfScore 26.50, instruction count 25, allocated bytes for code 100 (MethodHash=d6c204ce) for method System.Numerics.Tests.Perf_VectorOf`1[long]:LessThanAllBenchmark():ubyte:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -63,7 +61,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 26 (0x0001a) Actual length = 104 (0x000068)
+  Function Length   : 25 (0x00019) Actual length = 100 (0x000064)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
-4 (-3.85%) : 8634.dasm - System.Numerics.Tests.Perf_VectorOf`1[float]:GreaterThanOrEqualAllBenchmark():ubyte:this (FullOpts)
@@ -11,7 +11,6 @@
 ;# V01 OutArgs      [V01    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;* V02 cse0         [V02,T01] (  0,  0   )   byref  ->  zero-ref    "CSE #02: aggressive"
 ;  V03 cse1         [V03,T00] (  3,  3   )    long  ->   x0         "CSE #03: aggressive"
-;  V04 rat0         [V04,T02] (  3,  6   )  simd16  ->  d16         "ReplaceWithLclVar is creating a new local variable"
 ;
 ; Lcl frame size = 0
 
@@ -32,14 +31,13 @@ G_M45761_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
             movk    x0, #0xD1FFAB1E LSL #32
             ldr     q16, [x0]
             ldr     q17, [x0, #0x20]
-            fcmge   v16.4s, v16.4s, v17.4s
-            mvni    v17.4s, #0
-            cmeq    v16.4s, v16.4s, v17.4s
-            uminp   v16.4s, v16.4s, v16.4s
-            umov    x0, v16.d[0]
-            cmn     x0, #1
+            ptrue   p0.s
+            fcmge   p0.s, p0/z, z16.s, z17.s
+            ptrue   p1.s
+            cntp    x0, p1, p0.s
+            cmp     x0, #4
             cset    x0, eq
-						;; size=48 bbWeight=1 PerfScore 13.00
+						;; size=44 bbWeight=1 PerfScore 15.50
 G_M45761_IG04:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
@@ -52,7 +50,7 @@ G_M45761_IG05:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
             b       G_M45761_IG03
 						;; size=20 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 104, prolog size 8, PerfScore 22.00, instruction count 26, allocated bytes for code 104 (MethodHash=e2754d3e) for method System.Numerics.Tests.Perf_VectorOf`1[float]:GreaterThanOrEqualAllBenchmark():ubyte:this (FullOpts)
+; Total bytes of code 100, prolog size 8, PerfScore 24.50, instruction count 25, allocated bytes for code 100 (MethodHash=e2754d3e) for method System.Numerics.Tests.Perf_VectorOf`1[float]:GreaterThanOrEqualAllBenchmark():ubyte:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -63,7 +61,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 26 (0x0001a) Actual length = 104 (0x000068)
+  Function Length   : 25 (0x00019) Actual length = 100 (0x000064)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
+12 (+15.79%) : 12389.dasm - System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
@@ -37,15 +37,18 @@ G_M50163_IG02:        ; bbWeight=0.25, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, b
             mov     w1, #0xD1FFAB1E
 						;; size=20 bbWeight=0.25 PerfScore 1.38
 G_M50163_IG03:        ; bbWeight=4, gcrefRegs=0000 {}, byrefRegs=0001 {x0}, byref, isz
-            ldr     q16, [x0], #0x10
+            ldr     q16, [x0]
             mov     v17.16b, v16.16b
-            fcvtl   v17.2d, v17.2s
-            fcvtl2  v16.2d, v16.4s
-            eor     v16.2d, v17.2d, v16.2d
-            eor     v0.2d, v0.2d, v16.2d
+            ptrue   p0.s
+            fcvt    z17.d, p0/m, z17.s
+            ptrue   p0.s
+            fcvtlt  z16.d, p0/m, z16.s
+            eor     z16.d, z17.d, z16.d
+            eor     z0.d, z0.d, z16.d
+            add     x0, x0, #16
             sub     w1, w1, #1
             tbz     w1, #31, G_M50163_IG03
-						;; size=32 bbWeight=4 PerfScore 48.00
+						;; size=44 bbWeight=4 PerfScore 70.00
 G_M50163_IG04:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
             ; byrRegs -[x0]
             ldp     fp, lr, [sp], #0x10
@@ -57,7 +60,7 @@ G_M50163_IG05:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
             brk     #0
 						;; size=8 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 76, prolog size 8, PerfScore 51.75, instruction count 19, allocated bytes for code 76 (MethodHash=b7133c0c) for method System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
+; Total bytes of code 88, prolog size 8, PerfScore 73.75, instruction count 22, allocated bytes for code 88 (MethodHash=b7133c0c) for method System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -68,7 +71,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 19 (0x00013) Actual length = 76 (0x00004c)
+  Function Length   : 22 (0x00016) Actual length = 88 (0x000058)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
+32 (+10.39%) : 4104.dasm - SveBenchmarks.StrIndexOf:SveIndexOf():int:this (FullOpts)
@@ -54,7 +54,7 @@ G_M22667_IG04:        ; bbWeight=0.50, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, b
 						;; size=4 bbWeight=0.50 PerfScore 0.25
 G_M22667_IG05:        ; bbWeight=1, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, byref, isz
             ldrh    w3, [x0, #0x14]
-            dup     v16.8h, w3
+            mov     z16.h, w3
             ldr     w3, [x0, #0x10]
             mov     w4, wzr
             whilelt p0.h, w4, w3
@@ -78,38 +78,46 @@ G_M22667_IG06:        ; bbWeight=4, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, byre
 						;; size=48 bbWeight=4 PerfScore 98.00
 G_M22667_IG07:        ; bbWeight=2, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, byref, isz
             mov     w3, wzr
-            umov    w4, v17.h[0]
+            mov     z18.h, h17
+            umov    w4, v18.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
             mov     w3, #1
-            umov    w4, v17.h[1]
+            mov     z18.h, z17.h[1]
+            umov    w4, v18.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
             mov     w3, #2
-            umov    w4, v17.h[2]
+            mov     z18.h, z17.h[2]
+            umov    w4, v18.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
             mov     w3, #3
-            umov    w4, v17.h[3]
+            mov     z18.h, z17.h[3]
+            umov    w4, v18.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
             mov     w3, #4
-            umov    w4, v17.h[4]
+            mov     z18.h, z17.h[4]
+            umov    w4, v18.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
             mov     w3, #5
-            umov    w4, v17.h[5]
+            mov     z18.h, z17.h[5]
+            umov    w4, v18.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
             mov     w3, #6
-            umov    w4, v17.h[6]
+            mov     z18.h, z17.h[6]
+            umov    w4, v18.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
             mov     w3, #7
-            umov    w4, v17.h[7]
+            mov     z17.h, z17.h[7]
+            umov    w4, v17.h[0]
             cmp     w4, #1
             beq     G_M22667_IG11
-						;; size=128 bbWeight=2 PerfScore 48.00
+						;; size=160 bbWeight=2 PerfScore 80.00
 G_M22667_IG08:        ; bbWeight=4, gcrefRegs=0001 {x0}, byrefRegs=0000 {}, byref, isz
             ldr     w3, [x0, #0x10]
             whilelt p0.h, w1, w3
@@ -135,7 +143,7 @@ G_M22667_IG12:        ; bbWeight=0.50, epilog, nogc, extend
             ret     lr
 						;; size=8 bbWeight=0.50 PerfScore 1.00
 
-; Total bytes of code 308, prolog size 12, PerfScore 217.50, instruction count 77, allocated bytes for code 308 (MethodHash=8b05a774) for method SveBenchmarks.StrIndexOf:SveIndexOf():int:this (FullOpts)
+; Total bytes of code 340, prolog size 12, PerfScore 249.50, instruction count 85, allocated bytes for code 340 (MethodHash=8b05a774) for method SveBenchmarks.StrIndexOf:SveIndexOf():int:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -146,7 +154,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 77 (0x0004d) Actual length = 308 (0x000134)
+  Function Length   : 85 (0x00055) Actual length = 340 (0x000154)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
+8 (+10.00%) : 13623.dasm - System.Numerics.Tests.Perf_VectorOf`1[int]:LessThanBenchmark():System.Numerics.Vector`1[int]:this (FullOpts)
@@ -31,8 +31,10 @@ G_M61526_IG03:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
             movk    x0, #0xD1FFAB1E LSL #32
             ldr     q0, [x0]
             ldr     q16, [x0, #0x20]
-            cmgt    v0.4s, v16.4s, v0.4s
-						;; size=24 bbWeight=1 PerfScore 8.50
+            ptrue   p0.s
+            cmpgt   p0.s, p0/z, z16.s, z0.s
+            mov     z0.s, p0/z, #1
+						;; size=32 bbWeight=1 PerfScore 14.50
 G_M61526_IG04:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
@@ -45,7 +47,7 @@ G_M61526_IG05:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {
             b       G_M61526_IG03
 						;; size=20 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 80, prolog size 8, PerfScore 17.50, instruction count 20, allocated bytes for code 80 (MethodHash=6f960fa9) for method System.Numerics.Tests.Perf_VectorOf`1[int]:LessThanBenchmark():System.Numerics.Vector`1[int]:this (FullOpts)
+; Total bytes of code 88, prolog size 8, PerfScore 23.50, instruction count 22, allocated bytes for code 88 (MethodHash=6f960fa9) for method System.Numerics.Tests.Perf_VectorOf`1[int]:LessThanBenchmark():System.Numerics.Vector`1[int]:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -56,7 +58,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 20 (0x00014) Actual length = 80 (0x000050)
+  Function Length   : 22 (0x00016) Actual length = 88 (0x000058)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
Details

Size improvements/regressions per collection

| Collection | Contexts with diffs | Improvements | Regressions | Same size | Improvements (bytes) | Regressions (bytes) |
|---|---|---|---|---|---|---|
| benchmarks.run.windows.arm64.checked.mch | 62 | 5 | 33 | 24 | -44 | +812 |

PerfScore improvements/regressions per collection

| Collection | Contexts with diffs | Improvements | Regressions | Same PerfScore | Improvements (PerfScore) | Regressions (PerfScore) | PerfScore Overall in FullOpts |
|---|---|---|---|---|---|---|---|
| benchmarks.run.windows.arm64.checked.mch | 62 | 2 | 56 | 4 | -13.07% | +21.71% | +0.0674% |

Context information

| Collection | Diffed contexts | MinOpts | FullOpts | Missed, base | Missed, diff |
|---|---|---|---|---|---|
| benchmarks.run.windows.arm64.checked.mch | 15,914 | 4 | 15,910 | 0 (0.00%) | 0 (0.00%) |

jit-analyze output

benchmarks.run.windows.arm64.checked.mch

Summary of Code Size diffs:
(Lower is better)

Total bytes of base: 7485508 (overridden on cmd)
Total bytes of diff: 7486276 (overridden on cmd)
Total bytes of delta: 768 (0.01 % of base)
    diff is a regression.
    relative diff is a regression.
Detail diffs


Top file regressions (bytes):
         512 : 882.dasm (134.74% of base)
          52 : 7616.dasm (5.68% of base)
          52 : 8537.dasm (5.51% of base)
          32 : 4104.dasm (10.39% of base)
          12 : 12389.dasm (15.79% of base)
           8 : 13244.dasm (1.71% of base)
           8 : 11577.dasm (10.00% of base)
           8 : 13623.dasm (10.00% of base)
           8 : 12967.dasm (10.00% of base)
           8 : 3961.dasm (9.52% of base)
           8 : 8098.dasm (2.94% of base)
           8 : 6510.dasm (10.00% of base)
           8 : 8636.dasm (10.00% of base)
           8 : 8947.dasm (10.00% of base)
           8 : 5808.dasm (9.52% of base)
           4 : 13627.dasm (4.17% of base)
           4 : 8612.dasm (4.76% of base)
           4 : 6248.dasm (5.26% of base)
           4 : 4132.dasm (5.26% of base)
           4 : 5720.dasm (4.17% of base)

Top file improvements (bytes):
         -28 : 14589.dasm (-24.14% of base)
          -4 : 4471.dasm (-3.85% of base)
          -4 : 8634.dasm (-3.85% of base)
          -4 : 12980.dasm (-3.85% of base)
          -4 : 12391.dasm (-3.85% of base)

36 total files with Code Size differences (5 improved, 31 regressed), 23 unchanged.

Top method regressions (bytes):
         512 (134.74% of base) : 882.dasm - SveBenchmarks.StrCmp:SveStrCmp():long:this (FullOpts)
          52 ( 5.68% of base) : 7616.dasm - Algorithms.VectorDoubleRenderer:RenderSingleThreadedNoADT(float,float,float,float,float):this (FullOpts)
          52 ( 5.51% of base) : 8537.dasm - Algorithms.VectorDoubleRenderer:RenderSingleThreadedWithADT(float,float,float,float,float):this (FullOpts)
          32 (10.39% of base) : 4104.dasm - SveBenchmarks.StrIndexOf:SveIndexOf():int:this (FullOpts)
          12 (15.79% of base) : 12389.dasm - System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
           8 ( 2.94% of base) : 8098.dasm - SeekUnroll:Test(int):ubyte:this (FullOpts)
           8 (10.00% of base) : 8947.dasm - System.Numerics.Tests.Perf_VectorOf`1[byte]:AndNotBenchmark():System.Numerics.Vector`1[byte]:this (FullOpts)
           8 (10.00% of base) : 6510.dasm - System.Numerics.Tests.Perf_VectorOf`1[float]:GreaterThanOrEqualBenchmark():System.Numerics.Vector`1[float]:this (FullOpts)
           8 ( 9.52% of base) : 3961.dasm - System.Numerics.Tests.Perf_VectorOf`1[int]:ConditionalSelectBenchmark():System.Numerics.Vector`1[int]:this (FullOpts)
           8 (10.00% of base) : 13623.dasm - System.Numerics.Tests.Perf_VectorOf`1[int]:LessThanBenchmark():System.Numerics.Vector`1[int]:this (FullOpts)
           8 (10.00% of base) : 8636.dasm - System.Numerics.Tests.Perf_VectorOf`1[uint]:GreaterThanBenchmark():System.Numerics.Vector`1[uint]:this (FullOpts)
           8 (10.00% of base) : 11577.dasm - System.Numerics.Tests.Perf_VectorOf`1[uint]:LessThanOrEqualBenchmark():System.Numerics.Vector`1[uint]:this (FullOpts)
           8 ( 9.52% of base) : 5808.dasm - System.Numerics.Tests.Perf_VectorOf`1[ulong]:MaxBenchmark():System.Numerics.Vector`1[ulong]:this (FullOpts)
           8 (10.00% of base) : 12967.dasm - System.Numerics.Tests.Perf_VectorOf`1[ushort]:EqualsStaticBenchmark():System.Numerics.Vector`1[ushort]:this (FullOpts)
           8 ( 1.71% of base) : 13244.dasm - System.Text.Latin1Utility:NarrowUtf16ToLatin1(ulong,ulong,ulong):ulong (FullOpts)
           4 ( 6.25% of base) : 5151.dasm - System.Numerics.Tests.Perf_VectorConvert:Convert[int,float](int[]):System.Numerics.Vector`1[float] (FullOpts)
           4 ( 4.17% of base) : 13966.dasm - System.Numerics.Tests.Perf_VectorOf`1[byte]:GreaterThanAnyBenchmark():ubyte:this (FullOpts)
           4 ( 4.17% of base) : 12968.dasm - System.Numerics.Tests.Perf_VectorOf`1[float]:InequalityOperatorBenchmark():ubyte:this (FullOpts)
           4 ( 4.17% of base) : 13627.dasm - System.Numerics.Tests.Perf_VectorOf`1[int]:EqualsBenchmark():ubyte:this (FullOpts)
           4 ( 4.17% of base) : 12975.dasm - System.Numerics.Tests.Perf_VectorOf`1[int]:GreaterThanOrEqualAnyBenchmark():ubyte:this (FullOpts)

Top method improvements (bytes):
         -28 (-24.14% of base) : 14589.dasm - System.Numerics.Tests.Perf_VectorOf`1[ulong]:DotBenchmark():ulong:this (FullOpts)
          -4 (-3.85% of base) : 12391.dasm - System.Numerics.Tests.Perf_VectorOf`1[byte]:GreaterThanAllBenchmark():ubyte:this (FullOpts)
          -4 (-3.85% of base) : 8634.dasm - System.Numerics.Tests.Perf_VectorOf`1[float]:GreaterThanOrEqualAllBenchmark():ubyte:this (FullOpts)
          -4 (-3.85% of base) : 4471.dasm - System.Numerics.Tests.Perf_VectorOf`1[long]:LessThanAllBenchmark():ubyte:this (FullOpts)
          -4 (-3.85% of base) : 12980.dasm - System.Numerics.Tests.Perf_VectorOf`1[uint]:LessThanOrEqualAllBenchmark():ubyte:this (FullOpts)

Top method regressions (percentages):
         512 (134.74% of base) : 882.dasm - SveBenchmarks.StrCmp:SveStrCmp():long:this (FullOpts)
          12 (15.79% of base) : 12389.dasm - System.Numerics.Tests.Perf_VectorConvert:Widen[float,double](float[]):System.Numerics.Vector`1[double] (FullOpts)
          32 (10.39% of base) : 4104.dasm - SveBenchmarks.StrIndexOf:SveIndexOf():int:this (FullOpts)
           8 (10.00% of base) : 8947.dasm - System.Numerics.Tests.Perf_VectorOf`1[byte]:AndNotBenchmark():System.Numerics.Vector`1[byte]:this (FullOpts)
           8 (10.00% of base) : 6510.dasm - System.Numerics.Tests.Perf_VectorOf`1[float]:GreaterThanOrEqualBenchmark():System.Numerics.Vector`1[float]:this (FullOpts)
           8 (10.00% of base) : 13623.dasm - System.Numerics.Tests.Perf_VectorOf`1[int]:LessThanBenchmark():System.Numerics.Vector`1[int]:this (FullOpts)
           8 (10.00% of base) : 8636.dasm - System.Numerics.Tests.Perf_VectorOf`1[uint]:GreaterThanBenchmark():System.Numerics.Vector`1[uint]:this (FullOpts)
           8 (10.00% of base) : 11577.dasm - System.Numerics.Tests.Perf_VectorOf`1[uint]:LessThanOrEqualBenchmark():System.Numerics.Vector`1[uint]:this (FullOpts)
           8 (10.00% of base) : 12967.dasm - System.Numerics.Tests.Perf_VectorOf`1[ushort]:EqualsStaticBenchmark():System.Numerics.Vector`1[ushort]:this (FullOpts)
           8 ( 9.52% of base) : 3961.dasm - System.Numerics.Tests.Perf_VectorOf`1[int]:ConditionalSelectBenchmark():System.Numerics.Vector`1[int]:this (FullOpts)
           8 ( 9.52% of base) : 5808.dasm - System.Numerics.Tests.Perf_VectorOf`1[ulong]:MaxBenchmark():System.Numerics.Vector`1[ulong]:this (FullOpts)
           4 ( 6.25% of base) : 5151.dasm - System.Numerics.Tests.Perf_VectorConvert:Convert[int,float](int[]):System.Numerics.Vector`1[float] (FullOpts)
          52 ( 5.68% of base) : 7616.dasm - Algorithms.VectorDoubleRenderer:RenderSingleThreadedNoADT(float,float,float,float,float):this (FullOpts)
           4 ( 5.56% of base) : 4105.dasm - System.Numerics.Tests.Perf_VectorOf`1[ubyte]:NegateBenchmark():System.Numerics.Vector`1[ubyte]:this (FullOpts)
          52 ( 5.51% of base) : 8537.dasm - Algorithms.VectorDoubleRenderer:RenderSingleThreadedWithADT(float,float,float,float,float):this (FullOpts)
           4 ( 5.26% of base) : 13965.dasm - System.Numerics.Tests.Perf_VectorOf`1[long]:AbsBenchmark():System.Numerics.Vector`1[long]:this (FullOpts)
           4 ( 5.26% of base) : 12343.dasm - System.Numerics.Tests.Perf_VectorOf`1[long]:UnaryNegateOperatorBenchmark():System.Numerics.Vector`1[long]:this (FullOpts)
           4 ( 5.26% of base) : 4132.dasm - System.Numerics.Tests.Perf_VectorOf`1[ushort]:OnesComplementBenchmark():System.Numerics.Vector`1[ushort]:this (FullOpts)
           4 ( 5.26% of base) : 6248.dasm - System.Numerics.Tests.Perf_VectorOf`1[ushort]:OnesComplementOperatorBenchmark():System.Numerics.Vector`1[ushort]:this (FullOpts)
           4 ( 4.76% of base) : 8612.dasm - System.Numerics.Tests.Perf_VectorOf`1[ulong]:MinBenchmark():System.Numerics.Vector`1[ulong]:this (FullOpts)

Top method improvements (percentages):
         -28 (-24.14% of base) : 14589.dasm - System.Numerics.Tests.Perf_VectorOf`1[ulong]:DotBenchmark():ulong:this (FullOpts)
          -4 (-3.85% of base) : 12391.dasm - System.Numerics.Tests.Perf_VectorOf`1[byte]:GreaterThanAllBenchmark():ubyte:this (FullOpts)
          -4 (-3.85% of base) : 8634.dasm - System.Numerics.Tests.Perf_VectorOf`1[float]:GreaterThanOrEqualAllBenchmark():ubyte:this (FullOpts)
          -4 (-3.85% of base) : 4471.dasm - System.Numerics.Tests.Perf_VectorOf`1[long]:LessThanAllBenchmark():ubyte:this (FullOpts)
          -4 (-3.85% of base) : 12980.dasm - System.Numerics.Tests.Perf_VectorOf`1[uint]:LessThanOrEqualAllBenchmark():ubyte:this (FullOpts)


Diffs are based on 29,064 contexts (3 MinOpts, 29,061 FullOpts).

MISSED contexts: base: 0 (0.00%), diff: 102 (0.35%)

Base JIT options: UseSveForVectorT=0

Diff JIT options: UseSveForVectorT=1

Overall (+1,968 bytes)
| Collection | Base size (bytes) | Diff size (bytes) | PerfScore in Diffs |
|---|---|---|---|
| realworld.run.windows.arm64.checked.mch | 14,279,912 | +1,968 | -0.23% |
FullOpts (+1,968 bytes)
| Collection | Base size (bytes) | Diff size (bytes) | PerfScore in Diffs |
|---|---|---|---|
| realworld.run.windows.arm64.checked.mch | 13,914,572 | +1,968 | -0.23% |
Example diffs
realworld.run.windows.arm64.checked.mch
-44 (-1.90%) : 1683.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:Setup():this (FullOpts)
@@ -94,7 +94,7 @@
 ;  V83 tmp63        [V83,T56] (  2,  1   )     int  ->   x2         "Inlining Arg"
 ;  V84 tmp64        [V84,T57] (  2,  1   )    long  ->  x21         "Inlining Arg"
 ;* V85 tmp65        [V85,T05] (  0,  0   )  struct (32) zero-ref    do-not-enreg[SF] ld-addr-op "Inline ldloca(s) first use temp" <BepuPhysics.RigidPose>
-;  V86 tmp66        [V86,T83] (  4, 63.36)  simd12  ->  d11         multireg-ret multireg-dest ld-addr-op "Inline stloc first use temp" <System.Numerics.Vector3>
+;  V86 tmp66        [V86,T83] (  4, 63.36)  simd12  ->  [fp-0x200]  multireg-ret multireg-dest ld-addr-op "Inline stloc first use temp" <System.Numerics.Vector3>
 ;* V87 tmp67        [V87    ] (  0,  0   )  simd12  ->  zero-ref    "Inline stloc first use temp" <System.Numerics.Vector3>
 ;* V88 tmp68        [V88    ] (  0,  0   )   float  ->  zero-ref    "impAppendStmt"
 ;* V89 tmp69        [V89    ] (  0,  0   )   float  ->  zero-ref    "impAppendStmt"
@@ -106,11 +106,11 @@
 ;* V95 tmp75        [V95    ] (  0,  0   )  simd12  ->  zero-ref    "Inlining Arg" <System.Numerics.Vector3>
 ;* V96 tmp76        [V96    ] (  0,  0   )  simd12  ->  zero-ref    "Inlining Arg" <System.Numerics.Vector3>
 ;* V97 tmp77        [V97    ] (  0,  0   )   float  ->  zero-ref    "Inlining Arg"
-;  V98 tmp78        [V98,T91] (  3, 47.52)  double  ->  d13         "Inline stloc first use temp"
+;  V98 tmp78        [V98,T91] (  3, 47.52)  double  ->  d11         "Inline stloc first use temp"
 ;  V99 tmp79        [V99,T84] (  4, 63.36)  double  ->   d0         "Inline stloc first use temp"
 ;* V100 tmp80       [V100    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Quaternion>
 ;* V101 tmp81       [V101,T06] (  0,  0   )  struct (32) zero-ref    do-not-enreg[SF] ld-addr-op "Inline ldloca(s) first use temp" <BepuPhysics.RigidPose>
-;  V102 tmp82       [V102,T85] (  4, 63.36)  simd12  ->  [fp-0x200]  multireg-ret multireg-dest ld-addr-op spill-single-def "Inline stloc first use temp" <System.Numerics.Vector3>
+;  V102 tmp82       [V102,T85] (  4, 63.36)  simd12  ->  [fp-0x210]  multireg-ret multireg-dest ld-addr-op "Inline stloc first use temp" <System.Numerics.Vector3>
 ;* V103 tmp83       [V103    ] (  0,  0   )  simd12  ->  zero-ref    "Inline stloc first use temp" <System.Numerics.Vector3>
 ;* V104 tmp84       [V104    ] (  0,  0   )   float  ->  zero-ref    "impAppendStmt"
 ;* V105 tmp85       [V105    ] (  0,  0   )   float  ->  zero-ref    "impAppendStmt"
@@ -122,7 +122,7 @@
 ;* V111 tmp91       [V111    ] (  0,  0   )  simd12  ->  zero-ref    "Inlining Arg" <System.Numerics.Vector3>
 ;* V112 tmp92       [V112    ] (  0,  0   )  simd12  ->  zero-ref    "Inlining Arg" <System.Numerics.Vector3>
 ;* V113 tmp93       [V113    ] (  0,  0   )   float  ->  zero-ref    "Inlining Arg"
-;  V114 tmp94       [V114,T92] (  3, 47.52)  double  ->  [fp-0x208]  spill-single-def "Inline stloc first use temp"
+;  V114 tmp94       [V114,T92] (  3, 47.52)  double  ->  [fp-0x218]  spill-single-def "Inline stloc first use temp"
 ;  V115 tmp95       [V115,T86] (  4, 63.36)  double  ->   d0         "Inline stloc first use temp"
 ;* V116 tmp96       [V116    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "Inline ldloca(s) first use temp" <System.Numerics.Quaternion>
 ;* V117 tmp97       [V117    ] (  0,  0   )  simd12  ->  zero-ref    "Inlining Arg" <System.Numerics.Vector3>
@@ -155,7 +155,7 @@
 ;* V144 tmp124      [V144    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[float]>
 ;  V145 tmp125      [V145,T41] (  2,  1   )     ref  ->   x0         class-hnd single-def "Inlining Arg" <BepuUtilities.Memory.BufferPool>
 ;* V146 tmp126      [V146    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;  V147 tmp127      [V147    ] (  2,  0.50)  struct (16) [fp-0x218]  do-not-enreg[XS] must-init addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuUtilities.Memory.Buffer`1[ubyte]>
+;  V147 tmp127      [V147    ] (  2,  0.50)  struct (16) [fp-0x228]  do-not-enreg[XS] must-init addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuUtilities.Memory.Buffer`1[ubyte]>
 ;* V148 tmp128      [V148    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
 ;* V149 tmp129      [V149    ] (  0,  0   )     int  ->  zero-ref    single-def
 ;* V150 tmp130      [V150    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
@@ -170,10 +170,10 @@
 ;* V159 tmp139      [V159    ] (  0,  0   )  struct (16) zero-ref    multireg-arg ld-addr-op "NewObj constructor temp" <System.Span`1[System.Numerics.Vector3]>
 ;* V160 tmp140      [V160    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
 ;  V161 tmp141      [V161,T18] (  4,  7   )     ref  ->  x20         class-hnd single-def "Inlining Arg" <BepuUtilities.Memory.BufferPool>
-;  V162 tmp142      [V162    ] (  4,  3.50)  struct (48) [fp-0x248]  do-not-enreg[XSF] must-init addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuPhysics.Collidables.HullData>
+;  V162 tmp142      [V162    ] (  4,  3.50)  struct (48) [fp-0x258]  do-not-enreg[XSF] must-init addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuPhysics.Collidables.HullData>
 ;  V163 tmp143      [V163,T42] (  2,  1   )     ref  ->   x0         class-hnd single-def "Inlining Arg" <BepuUtilities.Memory.BufferPool>
 ;* V164 tmp144      [V164    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
-;  V165 tmp145      [V165    ] (  3,  0.75)  struct (16) [fp-0x258]  do-not-enreg[XS] must-init addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuUtilities.Memory.Buffer`1[ubyte]>
+;  V165 tmp145      [V165    ] (  3,  0.75)  struct (16) [fp-0x268]  do-not-enreg[XS] must-init addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuUtilities.Memory.Buffer`1[ubyte]>
 ;* V166 tmp146      [V166    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
 ;* V167 tmp147      [V167    ] (  0,  0   )     int  ->  zero-ref    single-def
 ;* V168 tmp148      [V168    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
@@ -253,9 +253,9 @@
 ;* V242 tmp222      [V242    ] (  0,  0   )     int  ->  zero-ref    "field V143._length (fldOffset=0x8)" P-INDEP
 ;* V243 tmp223      [V243    ] (  0,  0   )   byref  ->  zero-ref    "field V144._reference (fldOffset=0x0)" P-INDEP
 ;* V244 tmp224      [V244    ] (  0,  0   )     int  ->  zero-ref    "field V144._length (fldOffset=0x8)" P-INDEP
-;  V245 tmp225      [V245    ] (  2,  0.50)    long  ->  [fp-0x218]  do-not-enreg[X] addr-exposed "field V147.Memory (fldOffset=0x0)" P-DEP
-;  V246 tmp226      [V246    ] (  1,  0.25)     int  ->  [fp-0x210]  do-not-enreg[X] addr-exposed "field V147.length (fldOffset=0x8)" P-DEP
-;  V247 tmp227      [V247    ] (  1,  0.25)     int  ->  [fp-0x20C]  do-not-enreg[X] addr-exposed "field V147.Id (fldOffset=0xc)" P-DEP
+;  V245 tmp225      [V245    ] (  2,  0.50)    long  ->  [fp-0x228]  do-not-enreg[X] addr-exposed "field V147.Memory (fldOffset=0x0)" P-DEP
+;  V246 tmp226      [V246    ] (  1,  0.25)     int  ->  [fp-0x220]  do-not-enreg[X] addr-exposed "field V147.length (fldOffset=0x8)" P-DEP
+;  V247 tmp227      [V247    ] (  1,  0.25)     int  ->  [fp-0x21C]  do-not-enreg[X] addr-exposed "field V147.Id (fldOffset=0xc)" P-DEP
 ;  V248 tmp228      [V248,T78] (  2,  0.50)    long  ->  x21         single-def "field V153.Memory (fldOffset=0x0)" P-INDEP
 ;* V249 tmp229      [V249    ] (  0,  0   )     int  ->  zero-ref    single-def "field V153.length (fldOffset=0x8)" P-INDEP
 ;* V250 tmp230      [V250    ] (  0,  0   )     int  ->  zero-ref    single-def "field V153.Id (fldOffset=0xc)" P-INDEP
@@ -264,9 +264,9 @@
 ;* V253 tmp233      [V253    ] (  0,  0   )     int  ->  zero-ref    "field V157.Id (fldOffset=0xc)" P-INDEP
 ;  V254 tmp234      [V254,T22] (  3,  3   )   byref  ->  x21         single-def "field V159._reference (fldOffset=0x0)" P-INDEP
 ;* V255 tmp235      [V255    ] (  0,  0   )     int  ->  zero-ref    single-def "field V159._length (fldOffset=0x8)" P-INDEP
-;  V256 tmp236      [V256    ] (  2,  0.50)    long  ->  [fp-0x258]  do-not-enreg[X] addr-exposed "field V165.Memory (fldOffset=0x0)" P-DEP
-;  V257 tmp237      [V257    ] (  1,  0.25)     int  ->  [fp-0x250]  do-not-enreg[X] addr-exposed "field V165.length (fldOffset=0x8)" P-DEP
-;  V258 tmp238      [V258    ] (  2,  0.50)     int  ->  [fp-0x24C]  do-not-enreg[X] addr-exposed "field V165.Id (fldOffset=0xc)" P-DEP
+;  V256 tmp236      [V256    ] (  2,  0.50)    long  ->  [fp-0x268]  do-not-enreg[X] addr-exposed "field V165.Memory (fldOffset=0x0)" P-DEP
+;  V257 tmp237      [V257    ] (  1,  0.25)     int  ->  [fp-0x260]  do-not-enreg[X] addr-exposed "field V165.length (fldOffset=0x8)" P-DEP
+;  V258 tmp238      [V258    ] (  2,  0.50)     int  ->  [fp-0x25C]  do-not-enreg[X] addr-exposed "field V165.Id (fldOffset=0xc)" P-DEP
 ;* V259 tmp239      [V259    ] (  0,  0   )    long  ->  zero-ref    single-def "field V171.Memory (fldOffset=0x0)" P-INDEP
 ;* V260 tmp240      [V260    ] (  0,  0   )     int  ->  zero-ref    single-def "field V171.length (fldOffset=0x8)" P-INDEP
 ;* V261 tmp241      [V261    ] (  0,  0   )     int  ->  zero-ref    single-def "field V171.Id (fldOffset=0xc)" P-INDEP
@@ -286,20 +286,20 @@
 ;  V275 tmp255      [V275,T36] (  2,  1.25)     int  ->   x1         single-def "V07.[012..016)"
 ;* V276 tmp256      [V276    ] (  0,  0   )  simd12  ->  zero-ref    "V08.[000..012)"
 ;* V277 tmp257      [V277    ] (  0,  0   )  simd12  ->  zero-ref    "V08.[016..028)"
-;  V278 tmp258      [V278,T110] (  2, 31.68)   float  ->  d14         "V15.[000..004)"
-;  V279 tmp259      [V279,T111] (  2, 31.68)   float  ->  d15         "V15.[004..008)"
-;  V280 tmp260      [V280,T112] (  2, 31.68)   float  ->  d11         "V15.[008..012)"
-;  V281 tmp261      [V281,T113] (  2, 31.68)   float  ->  [fp-0x25C]  spill-single-def "V15.[012..016)"
+;  V278 tmp258      [V278,T110] (  2, 31.68)   float  ->  d12         "V15.[000..004)"
+;  V279 tmp259      [V279,T111] (  2, 31.68)   float  ->  d13         "V15.[004..008)"
+;  V280 tmp260      [V280,T112] (  2, 31.68)   float  ->  d10         "V15.[008..012)"
+;  V281 tmp261      [V281,T113] (  2, 31.68)   float  ->  d11         "V15.[012..016)"
 ;* V282 tmp262      [V282    ] (  0,  0   )  simd12  ->  zero-ref    "V15.[016..028)"
 ;  V283 tmp263      [V283,T114] (  2, 31.68)   float  ->  d17         "V16.[000..004)"
 ;  V284 tmp264      [V284,T115] (  2, 31.68)   float  ->  d18         "V16.[004..008)"
-;  V285 tmp265      [V285,T116] (  2, 31.68)   float  ->  d19         "V16.[008..012)"
+;  V285 tmp265      [V285,T116] (  2, 31.68)   float  ->  d15         "V16.[008..012)"
 ;  V286 tmp266      [V286,T117] (  2, 31.68)   float  ->  d16         "V16.[012..016)"
 ;* V287 tmp267      [V287    ] (  0,  0   )  simd12  ->  zero-ref    "V16.[016..028)"
 ;* V288 tmp268      [V288,T87] (  0,  0   )  simd16  ->  zero-ref    do-not-enreg[SF] "V85.[000..016)"
-;  V289 tmp269      [V289,T118] (  2, 31.68)  simd12  ->   d9         "V85.[016..028)"
+;  V289 tmp269      [V289,T118] (  2, 31.68)  simd12  ->  [fp-0x280]  "V85.[016..028)"
 ;* V290 tmp270      [V290,T88] (  0,  0   )  simd16  ->  zero-ref    do-not-enreg[SF] "V101.[000..016)"
-;  V291 tmp271      [V291,T119] (  2, 31.68)  simd12  ->  d13         "V101.[016..028)"
+;  V291 tmp271      [V291,T119] (  2, 31.68)  simd12  ->  [fp-0x290]  "V101.[016..028)"
 ;  V292 tmp272      [V292,T17] (  4,  8   )   byref  ->   x2         single-def "Spilling address for field-by-field copy"
 ;  V293 tmp273      [V293,T29] (  3,  1.50)     ref  ->   x0         single-def "arr expr"
 ;* V294 tmp274      [V294    ] (  0,  0   )     int  ->  zero-ref    "index expr"
@@ -316,27 +316,27 @@
 ;  V305 tmp285      [V305,T33] (  3,  1.50)     ref  ->   x0         single-def "arr expr"
 ;* V306 tmp286      [V306    ] (  0,  0   )     int  ->  zero-ref    "index expr"
 ;  V307 tmp287      [V307,T47] (  2,  1   )   byref  ->   x0         single-def "argument with side effect"
-;  V308 tmp288      [V308    ] (  2,  4   )  struct (48) [fp-0x290]  do-not-enreg[XS] addr-exposed "by-value struct argument" <BepuPhysics.Collidables.HullData>
+;  V308 tmp288      [V308    ] (  2,  4   )  struct (48) [fp-0x2C0]  do-not-enreg[XS] addr-exposed "by-value struct argument" <BepuPhysics.Collidables.HullData>
 ;  V309 tmp289      [V309,T34] (  3,  1.50)     ref  ->   x0         single-def "arr expr"
 ;* V310 tmp290      [V310    ] (  0,  0   )     int  ->  zero-ref    "index expr"
 ;  V311 tmp291      [V311,T48] (  2,  1   )   byref  ->   x0         single-def "argument with side effect"
-;  V312 GsCookie    [V312    ] (  1,  1   )    long  ->  [fp-0x2A8]  do-not-enreg[X] addr-exposed "GSSecurityCookie"
-;  V313 cse0        [V313,T101] (  2, 31.68)   float  ->  d12         "CSE #05: aggressive"
+;  V312 GsCookie    [V312    ] (  1,  1   )    long  ->  [fp-0x2D0]  do-not-enreg[X] addr-exposed "GSSecurityCookie"
+;  V313 cse0        [V313,T101] (  2, 31.68)   float  ->  d11         "CSE #05: aggressive"
 ;  V314 cse1        [V314,T102] (  2, 31.68)   float  ->  d16         "CSE #09: aggressive"
-;  V315 cse2        [V315,T103] (  2, 31.68)   float  ->  d15         "CSE #03: aggressive"
-;  V316 cse3        [V316,T104] (  2, 31.68)   float  ->  d11         "CSE #04: aggressive"
-;  V317 cse4        [V317,T105] (  2, 31.68)   float  ->  [fp-0x294]  spill-single-def "CSE #07: aggressive"
-;  V318 cse5        [V318,T106] (  2, 31.68)   float  ->  [fp-0x298]  spill-single-def "CSE #08: aggressive"
-;  V319 cse6        [V319,T107] (  2, 31.68)   float  ->  d14         "CSE #02: aggressive"
-;  V320 cse7        [V320,T108] (  2, 31.68)   float  ->  [fp-0x29C]  spill-single-def "CSE #06: aggressive"
+;  V315 cse2        [V315,T103] (  2, 31.68)   float  ->  d13         "CSE #03: aggressive"
+;  V316 cse3        [V316,T104] (  2, 31.68)   float  ->  d10         "CSE #04: aggressive"
+;  V317 cse4        [V317,T105] (  2, 31.68)   float  ->  [fp-0x2C4]  spill-single-def "CSE #07: aggressive"
+;  V318 cse5        [V318,T106] (  2, 31.68)   float  ->  d15         "CSE #08: aggressive"
+;  V319 cse6        [V319,T107] (  2, 31.68)   float  ->  d12         "CSE #02: aggressive"
+;  V320 cse7        [V320,T108] (  2, 31.68)   float  ->  [fp-0x2C8]  spill-single-def "CSE #06: aggressive"
 ;  V321 cse8        [V321,T07] (  4, 63.36)    long  ->   x1         "CSE #11: aggressive"
 ;  V322 cse9        [V322,T16] (  3, 12   )    long  ->   x2         "CSE #23: moderate"
 ;  V323 cse10       [V323,T109] (  3, 31.93)   float  ->   d8         hoist "CSE #01: aggressive"
 ;  V324 cse11       [V324,T13] (  4, 16   )    long  ->   x1         "CSE #22: aggressive"
 ;  V325 rat0        [V325,T35] (  3,  1.50)     int  ->   x0         "ReplaceWithLclVar is creating a new local variable"
-;  TEMP_01                                  simd16  ->  [fp-0x2B8]
+;  TEMP_01                                  simd16  ->  [fp-0x2E0]
 ;
-; Lcl frame size = 592
+; Lcl frame size = 624
 
 G_M18337_IG01:        ; bbWeight=0.25, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
             stp     d8, d9, [sp, #-0x80]!
@@ -618,49 +618,47 @@ G_M18337_IG04:        ; bbWeight=15.84, gcrefRegs=180000 {x19 x20}, byrefRegs=20
             movk    x1, #0xD1FFAB1E LSL #16
             movk    x1, #0xD1FFAB1E LSL #32
             ldr     x1, [x1]
-            mov     v10.d[0], v9.d[1]
+            str     z9, [fp, #-40, mul vl]	// [V289 tmp269]
             blr     x1
             ; gcrRegs -[x0]
             ; gcr arg pop 0
-            mov     v11.s[2], v2.s[0]
-            mov     v11.s[1], v1.s[0]
-            mov     v11.s[0], v0.s[0]
+            mov     v10.s[2], v2.s[0]
+            mov     v10.s[1], v1.s[0]
+            mov     v10.s[0], v0.s[0]
             mov     x0, x20
             ; gcrRegs +[x0]
             movz    x1, #0xD1FFAB1E      // code for System.Random:NextSingle():float:this
             movk    x1, #0xD1FFAB1E LSL #16
             movk    x1, #0xD1FFAB1E LSL #32
             ldr     x1, [x1]
-            mov     v12.d[0], v11.d[1]
+            str     z10, [fp, #-32, mul vl]	// [V86 tmp66]
             blr     x1
             ; gcrRegs -[x0]
             ; gcr arg pop 0
             fmul    s0, s0, s8
             fcvt    d0, s0
             fmov    d16, #0.5000
-            fmul    d13, d0, d16
-            fmov    d0, d13
+            fmul    d11, d0, d16
+            fmov    d0, d11
             bl      <unknown method>
             ; gcr arg pop 0
-            mov     v11.d[1], v12.d[0]
-            dup     s16, v11.s[0]
+            ldr     z10, [fp, #-32, mul vl]	// [V86 tmp66]
+            dup     s16, v10.s[0]
             fcvt    d16, s16
             fmul    d16, d16, d0
-            fcvt    s14, d16
-            dup     s16, v11.s[1]
+            fcvt    s12, d16
+            dup     s16, v10.s[1]
             fcvt    d16, s16
             fmul    d16, d16, d0
-            fcvt    s15, d16
-            dup     s16, v11.s[2]
+            fcvt    s13, d16
+            dup     s16, v10.s[2]
             fcvt    d16, s16
             fmul    d0, d16, d0
-            fcvt    s11, d0
-            fmov    d0, d13
+            fcvt    s10, d0
+            fmov    d0, d11
             bl      <unknown method>
             ; gcr arg pop 0
-            fcvt    s12, d0
-            movn    xip1, #0xD1FFAB1E
-            str     s12, [fp, xip1]	// [V281 tmp261]
+            fcvt    s11, d0
             mov     x0, x20
             ; gcrRegs +[x0]
             movz    x1, #0xD1FFAB1E      // code for System.Random:NextSingle():float:this
@@ -670,9 +668,9 @@ G_M18337_IG04:        ; bbWeight=15.84, gcrefRegs=180000 {x19 x20}, byrefRegs=20
             blr     x1
             ; gcrRegs -[x0]
             ; gcr arg pop 0
-            fmov    s13, s0
+            fmov    s14, s0
             movn    xip1, #0xD1FFAB1E
-            str     q13, [fp, xip1]	// [TEMP_01]
+            str     q14, [fp, xip1]	// [TEMP_01]
             mov     x0, x20
             ; gcrRegs +[x0]
             movz    x1, #0xD1FFAB1E      // code for System.Random:NextSingle():float:this
@@ -683,10 +681,10 @@ G_M18337_IG04:        ; bbWeight=15.84, gcrefRegs=180000 {x19 x20}, byrefRegs=20
             ; gcrRegs -[x0]
             ; gcr arg pop 0
             movn    xip1, #0xD1FFAB1E
-            ldr     q13, [fp, xip1]	// [TEMP_01]
-            ins     v13.s[1], v0.s[0]
+            ldr     q14, [fp, xip1]	// [TEMP_01]
+            ins     v14.s[1], v0.s[0]
             movn    xip1, #0xD1FFAB1E
-            str     q13, [fp, xip1]	// [TEMP_01]
+            str     q14, [fp, xip1]	// [TEMP_01]
             mov     x0, x20
             ; gcrRegs +[x0]
             movz    x1, #0xD1FFAB1E      // code for System.Random:NextSingle():float:this
@@ -697,36 +695,34 @@ G_M18337_IG04:        ; bbWeight=15.84, gcrefRegs=180000 {x19 x20}, byrefRegs=20
             ; gcrRegs -[x0]
             ; gcr arg pop 0
             movn    xip1, #0xD1FFAB1E
-            ldr     q13, [fp, xip1]	// [TEMP_01]
-            ins     v13.s[2], v0.s[0]
+            ldr     q14, [fp, xip1]	// [TEMP_01]
+            ins     v14.s[2], v0.s[0]
             movi    v16.16b, #0
-            ins     v13.s[3], v16.s[0]
+            ins     v14.s[3], v16.s[0]
...
-16 (-1.40%) : 1242.dasm - BepuPhysics.Collidables.Compound:FindLocalOverlaps[BepuPhysics.CollisionDetection.CollisionTasks.ChildOverlapsCollection](byref,byref,byref,float,BepuUtilities.Memory.BufferPool,BepuPhysics.Collidables.Shapes,ulong):this (FullOpts)
@@ -17,15 +17,15 @@
 ;  V06 arg6         [V06,T24] (  3,  6   )     ref  ->  x21         class-hnd single-def <BepuPhysics.Collidables.Shapes>
 ;  V07 arg7         [V07,T39] (  3,  3   )    long  ->   x6         single-def
 ;  V08 loc0         [V08,T60] (  2,  2   )  simd12  ->  d16         ld-addr-op <System.Numerics.Vector3>
-;  V09 loc1         [V09,T52] (  3,  9   )  simd12  ->   d8         ld-addr-op <System.Numerics.Vector3>
+;  V09 loc1         [V09,T52] (  3,  9   )  simd12  ->  [fp+0x70]   ld-addr-op <System.Numerics.Vector3>
 ;* V10 loc2         [V10    ] (  0,  0   )  struct (32) zero-ref    do-not-enreg[SF] ld-addr-op <BepuPhysics.Trees.TreeRay>
 ;  V11 loc3         [V11,T04] ( 12, 23   )   byref  ->  x22         single-def
 ;  V12 loc4         [V12,T06] (  6, 19   )     int  ->  x23        
 ;  V13 loc5         [V13,T05] (  5, 20   )   byref  ->  x24        
-;  V14 loc6         [V14    ] (  4, 16   )  simd12  ->  [fp+0x40]   do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector3>
-;  V15 loc7         [V15    ] (  4, 16   )  simd12  ->  [fp+0x30]   do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector3>
-;  V16 loc8         [V16    ] (  1,  4   )   float  ->  [fp+0x2C]   do-not-enreg[X] addr-exposed ld-addr-op
-;  V17 loc9         [V17    ] (  3, 12   )   float  ->  [fp+0x28]   do-not-enreg[X] addr-exposed ld-addr-op
+;  V14 loc6         [V14    ] (  4, 16   )  simd12  ->  [fp+0x60]   do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector3>
+;  V15 loc7         [V15    ] (  4, 16   )  simd12  ->  [fp+0x50]   do-not-enreg[XS] addr-exposed ld-addr-op <System.Numerics.Vector3>
+;  V16 loc8         [V16    ] (  1,  4   )   float  ->  [fp+0x4C]   do-not-enreg[X] addr-exposed ld-addr-op
+;  V17 loc9         [V17    ] (  3, 12   )   float  ->  [fp+0x48]   do-not-enreg[X] addr-exposed ld-addr-op
 ;# V18 OutArgs      [V18    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;* V19 tmp1         [V19    ] (  0,  0   )     ref  ->  zero-ref    class-hnd "impAppendStmt" <BepuPhysics.Collidables.ShapeBatch>
 ;* V20 tmp2         [V20    ] (  0,  0   )  simd12  ->  zero-ref    "spilled call-like call argument"
@@ -131,7 +131,7 @@
 ;* V120 tmp102      [V120    ] (  0,  0   )     int  ->  zero-ref    "Inline return value spill temp"
 ;* V121 tmp103      [V121    ] (  0,  0   )     int  ->  zero-ref    "Inlining Arg"
 ;  V122 tmp104      [V122,T08] (  3, 12   )     int  ->   x1         "Inlining Arg"
-;  V123 tmp105      [V123    ] (  5, 10   )  struct (16) [fp+0x18]   do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuUtilities.Memory.Buffer`1[ubyte]>
+;  V123 tmp105      [V123    ] (  5, 10   )  struct (16) [fp+0x38]   do-not-enreg[XS] addr-exposed ld-addr-op "Inline ldloca(s) first use temp" <BepuUtilities.Memory.Buffer`1[ubyte]>
 ;  V124 tmp106      [V124,T09] (  4, 16   )     int  ->   x0         "Inlining Arg"
 ;  V125 tmp107      [V125,T32] (  3,  6   )     int  ->   x1        
 ;* V126 tmp108      [V126    ] (  0,  0   )     int  ->  zero-ref    "Inline stloc first use temp"
@@ -156,15 +156,15 @@
 ;* V145 tmp127      [V145    ] (  0,  0   )    long  ->  zero-ref    "field V117.Memory (fldOffset=0x0)" P-INDEP
 ;* V146 tmp128      [V146    ] (  0,  0   )     int  ->  zero-ref    "field V117.length (fldOffset=0x8)" P-INDEP
 ;* V147 tmp129      [V147    ] (  0,  0   )     int  ->  zero-ref    "field V117.Id (fldOffset=0xc)" P-INDEP
-;  V148 tmp130      [V148    ] (  2,  4   )    long  ->  [fp+0x18]   do-not-enreg[X] addr-exposed "field V123.Memory (fldOffset=0x0)" P-DEP
-;  V149 tmp131      [V149    ] (  3,  6   )     int  ->  [fp+0x20]   do-not-enreg[X] addr-exposed "field V123.length (fldOffset=0x8)" P-DEP
-;  V150 tmp132      [V150    ] (  2,  4   )     int  ->  [fp+0x24]   do-not-enreg[X] addr-exposed "field V123.Id (fldOffset=0xc)" P-DEP
+;  V148 tmp130      [V148    ] (  2,  4   )    long  ->  [fp+0x38]   do-not-enreg[X] addr-exposed "field V123.Memory (fldOffset=0x0)" P-DEP
+;  V149 tmp131      [V149    ] (  3,  6   )     int  ->  [fp+0x40]   do-not-enreg[X] addr-exposed "field V123.length (fldOffset=0x8)" P-DEP
+;  V150 tmp132      [V150    ] (  2,  4   )     int  ->  [fp+0x44]   do-not-enreg[X] addr-exposed "field V123.Id (fldOffset=0xc)" P-DEP
 ;  V151 tmp133      [V151,T34] (  3,  6   )    long  ->  x26         "field V129.Memory (fldOffset=0x0)" P-INDEP
 ;  V152 tmp134      [V152,T42] (  2,  4   )     int  ->  x25         "field V129.length (fldOffset=0x8)" P-INDEP
 ;  V153 tmp135      [V153,T43] (  2,  4   )     int  ->  x27         "field V129.Id (fldOffset=0xc)" P-INDEP
-;  V154 tmp136      [V154,T53] (  3,  9   )  simd12  ->  d11         "V10.[000..012)"
+;  V154 tmp136      [V154,T53] (  3,  9   )  simd12  ->  [fp+0x20]   "V10.[000..012)"
 ;  V155 tmp137      [V155,T56] (  2,  5   )   float  ->  d10         "V10.[012..016)"
-;  V156 tmp138      [V156,T51] (  4, 10   )  simd12  ->   d9         "V10.[016..028)"
+;  V156 tmp138      [V156,T51] (  4, 10   )  simd12  ->  [fp+0x10]   "V10.[016..028)"
 ;  V157 tmp139      [V157,T01] (  3, 24   )     ref  ->   x0         "arr expr"
 ;  V158 tmp140      [V158,T03] (  3, 24   )     int  ->   x2         "index expr"
 ;  V159 tmp141      [V159,T02] (  3, 24   )     ref  ->   x0         "argument with side effect"
@@ -179,19 +179,17 @@
 ;  V168 cse4        [V168,T59] (  3,  3   )   simd8  ->  d17         "CSE #01: moderate"
 ;  V169 rat0        [V169,T19] (  3, 12   )     int  ->   x1         "ReplaceWithLclVar is creating a new local variable"
 ;
-; Lcl frame size = 72
+; Lcl frame size = 112
 
 G_M59096_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
-            stp     fp, lr, [sp, #-0xE0]!
-            stp     d8, d9, [sp, #0x58]
-            stp     d10, d11, [sp, #0x68]
-            stp     d12, d13, [sp, #0x78]
-            str     d14, [sp, #0x88]
-            stp     x19, x20, [sp, #0x90]
-            stp     x21, x22, [sp, #0xA0]
-            stp     x23, x24, [sp, #0xB0]
-            stp     x25, x26, [sp, #0xC0]
-            stp     x27, x28, [sp, #0xD0]
+            stp     fp, lr, [sp, #-0xF0]!
+            stp     d8, d9, [sp, #0x80]
+            stp     d10, d11, [sp, #0x90]
+            stp     x19, x20, [sp, #0xA0]
+            stp     x21, x22, [sp, #0xB0]
+            stp     x23, x24, [sp, #0xC0]
+            stp     x25, x26, [sp, #0xD0]
+            stp     x27, x28, [sp, #0xE0]
             mov     fp, sp
             mov     x19, x0
             ; byrRegs +[x19]
@@ -199,7 +197,7 @@ G_M59096_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             ; gcrRegs +[x20]
             mov     x21, x5
             ; gcrRegs +[x21]
-						;; size=56 bbWeight=1 PerfScore 12.00
+						;; size=48 bbWeight=1 PerfScore 10.00
 G_M59096_IG02:        ; bbWeight=1, gcrefRegs=300000 {x20 x21}, byrefRegs=8000E {x1 x2 x3 x19}, byref, isz
             ; byrRegs +[x1-x3]
             ldr     d16, [x1]
@@ -275,31 +273,31 @@ G_M59096_IG03:        ; bbWeight=4, gcrefRegs=300000 {x20 x21}, byrefRegs=480000
             and     w1, w1, #0xD1FFAB1E
             add     x2, x24, #4
             ; byrRegs +[x2]
-            add     x3, fp, #44	// [V16 loc8]
-            add     x4, fp, #40	// [V17 loc9]
-            add     x5, fp, #64	// [V14 loc6]
-            add     x6, fp, #48	// [V15 loc7]
+            add     x3, fp, #76	// [V16 loc8]
+            add     x4, fp, #72	// [V17 loc9]
+            add     x5, fp, #96	// [V14 loc6]
+            add     x6, fp, #80	// [V15 loc7]
             ldr     x7, [x0]
             ldr     x7, [x7, #0x50]
             ldr     x7, [x7]
-            mov     v12.d[0], v9.d[1]
-            mov     v13.d[0], v8.d[1]
-            mov     v14.d[0], v11.d[1]
+            str     z9, [fp, #1, mul vl]	// [V156 tmp138]
+            str     z8, [fp, #7, mul vl]	// [V09 loc1]
+            str     z11, [fp, #2, mul vl]	// [V154 tmp136]
             blr     x7
             ; gcrRegs -[x0]
             ; byrRegs -[x2] +[x24]
-            ldr     q16, [fp, #0x40]	// [V14 loc6]
+            ldr     q16, [fp, #0x60]	// [V14 loc6]
             add     x0, x24, #20
             ; byrRegs +[x0]
             ldr     d17, [x0]
             ldr     w1, [x0, #0x08]
             mov     v17.s[2], w1
             fadd    v16.4s, v16.4s, v17.4s
-            mov     v8.d[1], v13.d[0]
+            ldr     z8, [fp, #7, mul vl]	// [V09 loc1]
             mov     v17.16b, v8.16b
             fsub    v16.4s, v16.4s, v17.4s
-            str     q16, [fp, #0x40]	// [V14 loc6]
-            ldr     q16, [fp, #0x30]	// [V15 loc7]
+            str     q16, [fp, #0x60]	// [V14 loc6]
+            ldr     q16, [fp, #0x50]	// [V15 loc7]
             add     x0, x24, #20
             ldr     d17, [x0]
             ldr     w1, [x0, #0x08]
@@ -307,15 +305,15 @@ G_M59096_IG03:        ; bbWeight=4, gcrefRegs=300000 {x20 x21}, byrefRegs=480000
             fadd    v16.4s, v16.4s, v17.4s
             mov     v17.16b, v8.16b
             fadd    v16.4s, v16.4s, v17.4s
-            str     q16, [fp, #0x30]	// [V15 loc7]
-            ldr     q16, [fp, #0x40]	// [V14 loc6]
-            mov     v9.d[1], v12.d[0]
+            str     q16, [fp, #0x50]	// [V15 loc7]
+            ldr     q16, [fp, #0x60]	// [V14 loc6]
+            ldr     z9, [fp, #1, mul vl]	// [V156 tmp138]
             mov     v17.16b, v9.16b
             fmul    v16.4s, v16.4s, v17.4s
-            mov     v11.d[1], v14.d[0]
+            ldr     z11, [fp, #2, mul vl]	// [V154 tmp136]
             mov     v17.16b, v11.16b
             fsub    v16.4s, v16.4s, v17.4s
-            ldr     q17, [fp, #0x30]	// [V15 loc7]
+            ldr     q17, [fp, #0x50]	// [V15 loc7]
             mov     v18.16b, v9.16b
             fmul    v17.4s, v17.4s, v18.4s
             mov     v18.16b, v11.16b
@@ -343,11 +341,11 @@ G_M59096_IG03:        ; bbWeight=4, gcrefRegs=300000 {x20 x21}, byrefRegs=480000
             movi    v19.4s, #0
             fmax    v16.4s, v16.4s, v19.4s
             fmax    v16.4s, v16.4s, v18.4s
-            str     s16, [fp, #0x28]	// [V17 loc9]
-            ldr     s16, [fp, #0x28]	// [V17 loc9]
+            str     s16, [fp, #0x48]	// [V17 loc9]
+            ldr     s16, [fp, #0x48]	// [V17 loc9]
             fcmp    s16, s17
             bhi     G_M59096_IG14
-						;; size=332 bbWeight=4 PerfScore 550.00
+						;; size=332 bbWeight=4 PerfScore 610.00
 G_M59096_IG04:        ; bbWeight=2, gcrefRegs=300000 {x20 x21}, byrefRegs=480000 {x19 x22}, byref, isz
             ; byrRegs -[x0 x24]
             ldr     w0, [x22, #0x08]
@@ -408,37 +406,37 @@ G_M59096_IG10:        ; bbWeight=2, gcrefRegs=300000 {x20 x21}, byrefRegs=480000
             add     x0, x0, x1
             ; gcrRegs -[x0]
             ; byrRegs +[x0]
-            add     x1, fp, #24	// [V123 tmp105]
+            add     x1, fp, #56	// [V123 tmp105]
             movz    x2, #0xD1FFAB1E      // code for BepuUtilities.Memory.BufferPool+PowerPool:Take(byref):this
             movk    x2, #0xD1FFAB1E LSL #16
             movk    x2, #0xD1FFAB1E LSL #32
             ldr     x2, [x2]
-            mov     v12.d[0], v9.d[1]
-            mov     v13.d[0], v8.d[1]
-            mov     v14.d[0], v11.d[1]
+            str     z9, [fp, #1, mul vl]	// [V156 tmp138]
+            str     z8, [fp, #7, mul vl]	// [V09 loc1]
+            str     z11, [fp, #2, mul vl]	// [V154 tmp136]
             blr     x2
             ; byrRegs -[x0]
-            ldr     w2, [fp, #0x20]	// [V149 tmp131]
+            ldr     w2, [fp, #0x40]	// [V149 tmp131]
             asr     w2, w2, #31
             and     w2, w2, #3
-            ldr     w1, [fp, #0x20]	// [V149 tmp131]
+            ldr     w1, [fp, #0x40]	// [V149 tmp131]
             add     w2, w2, w1
             asr     w25, w2, #2
-            ldr     x26, [fp, #0x18]	// [V148 tmp130]
-            ldr     w27, [fp, #0x24]	// [V150 tmp132]
+            ldr     x26, [fp, #0x38]	// [V148 tmp130]
+            ldr     w27, [fp, #0x44]	// [V150 tmp132]
             ldr     w2, [x22, #0x08]
             cmp     w2, #0
-            mov     v9.d[1], v12.d[0]
-            mov     v8.d[1], v13.d[0]
-            mov     v11.d[1], v14.d[0]
+            ldr     z9, [fp, #1, mul vl]	// [V156 tmp138]
+            ldr     z8, [fp, #7, mul vl]	// [V09 loc1]
+            ldr     z11, [fp, #2, mul vl]	// [V154 tmp136]
             ble     G_M59096_IG12
             lsl     w2, w24, #2
             mov     w2, w2
             ldr     x1, [x22]
             mov     x0, x26
-            mov     v12.d[0], v9.d[1]
-            mov     v13.d[0], v8.d[1]
-            mov     v14.d[0], v11.d[1]
+            str     z9, [fp, #1, mul vl]	// [V156 tmp138]
+            str     z8, [fp, #7, mul vl]	// [V09 loc1]
+            str     z11, [fp, #2, mul vl]	// [V154 tmp136]
             bl      CORINFO_HELP_MEMCPY
             ldr     w1, [x22, #0x0C]
             asr     w0, w1, #26
@@ -463,9 +461,9 @@ G_M59096_IG10:        ; bbWeight=2, gcrefRegs=300000 {x20 x21}, byrefRegs=480000
             ldr     w0, [x0, #0x08]
             ; gcrRegs -[x0]
             cmp     w1, w0
-            mov     v9.d[1], v12.d[0]
-            mov     v8.d[1], v13.d[0]
-            mov     v11.d[1], v14.d[0]
+            ldr     z9, [fp, #1, mul vl]	// [V156 tmp138]
+            ldr     z8, [fp, #7, mul vl]	// [V09 loc1]
+            ldr     z11, [fp, #2, mul vl]	// [V154 tmp136]
             bne     G_M59096_IG11
             lsl     w1, w1, #1
             add     x0, x28, #8
@@ -474,15 +472,15 @@ G_M59096_IG10:        ; bbWeight=2, gcrefRegs=300000 {x20 x21}, byrefRegs=480000
             movk    x2, #0xD1FFAB1E LSL #16
             movk    x2, #0xD1FFAB1E LSL #32
             ldr     x2, [x2]
-            mov     v12.d[0], v9.d[1]
-            mov     v13.d[0], v8.d[1]
-            mov     v14.d[0], v11.d[1]
+            str     z9, [fp, #1, mul vl]	// [V156 tmp138]
+            str     z8, [fp, #7, mul vl]	// [V09 loc1]
+            str     z11, [fp, #2, mul vl]	// [V154 tmp136]
             blr     x2
             ; byrRegs -[x0]
-            mov     v9.d[1], v12.d[0]
-            mov     v8.d[1], v13.d[0]
-            mov     v11.d[1], v14.d[0]
-						;; size=304 bbWeight=2 PerfScore 201.00
+            ldr     z9, [fp, #1, mul vl]	// [V156 tmp138]
...
-8 (-1.05%) : 1624.dasm - BepuPhysics.Collidables.Compound:RayTest[DemoBenchmarks.ShapeRayBenchmarksDeep+HitHandler](byref,byref,byref,BepuPhysics.Collidables.Shapes,byref):this (FullOpts)
@@ -15,11 +15,11 @@
 ;  V04 arg4         [V04,T10] (  3,  6   )     ref  ->  x22         class-hnd single-def <BepuPhysics.Collidables.Shapes>
 ;  V05 arg5         [V05,T08] (  4,  6   )   byref  ->  x20         single-def
 ;* V06 loc0         [V06    ] (  0,  0   )  struct (40) zero-ref    ld-addr-op <BepuUtilities.Matrix3x3>
-;  V07 loc1         [V07    ] (  4,  7   )  struct (32) [fp+0x28]   do-not-enreg[XSF] addr-exposed ld-addr-op <BepuPhysics.Trees.RayData>
+;  V07 loc1         [V07    ] (  4,  7   )  struct (32) [fp+0x50]   do-not-enreg[XSF] addr-exposed ld-addr-op <BepuPhysics.Trees.RayData>
 ;  V08 loc2         [V08,T55] (  2,  2   )  simd12  ->  d16         ld-addr-op <System.Numerics.Vector3>
 ;  V09 loc3         [V09,T03] (  5, 17   )     int  ->  x23        
 ;  V10 loc4         [V10,T04] (  4, 16   )   byref  ->  x26        
-;  V11 loc5         [V11    ] (  7, 22   )  struct (16) [fp+0x18]   do-not-enreg[XSF] addr-exposed ld-addr-op <BepuPhysics.Collidables.CompoundChildShapeTester>
+;  V11 loc5         [V11    ] (  7, 22   )  struct (16) [fp+0x40]   do-not-enreg[XSF] addr-exposed ld-addr-op <BepuPhysics.Collidables.CompoundChildShapeTester>
 ;  V12 loc6         [V12,T39] (  2,  4   )  simd12  ->  d17         ld-addr-op <System.Numerics.Vector3>
 ;# V13 OutArgs      [V13    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;  V14 tmp1         [V14,T00] (  3, 24   )     ref  ->  x27         class-hnd "impAppendStmt" <BepuPhysics.Collidables.ShapeBatch>
@@ -110,9 +110,9 @@
 ;* V99 tmp86        [V99    ] (  0,  0   )  simd12  ->  zero-ref    "Inlining Arg" <System.Numerics.Vector3>
 ;* V100 tmp87       [V100    ] (  0,  0   )  simd12  ->  zero-ref    "Inlining Arg" <System.Numerics.Vector3>
 ;* V101 tmp88       [V101    ] (  0,  0   )   byref  ->  zero-ref    "Inlining Arg"
-;  V102 tmp89       [V102,T36] (  3,  4   )  simd12  ->   d8         "field V06.X (fldOffset=0x0)" P-INDEP
-;  V103 tmp90       [V103,T37] (  3,  4   )  simd12  ->   d9         "field V06.Y (fldOffset=0xc)" P-INDEP
-;  V104 tmp91       [V104,T38] (  3,  4   )  simd12  ->  d10         "field V06.Z (fldOffset=0x18)" P-INDEP
+;  V102 tmp89       [V102,T36] (  3,  4   )  simd12  ->  [fp+0x30]   "field V06.X (fldOffset=0x0)" P-INDEP
+;  V103 tmp90       [V103,T37] (  3,  4   )  simd12  ->  [fp+0x20]   "field V06.Y (fldOffset=0xc)" P-INDEP
+;  V104 tmp91       [V104,T38] (  3,  4   )  simd12  ->  [fp+0x10]   "field V06.Z (fldOffset=0x18)" P-INDEP
 ;  V105 tmp92       [V105,T01] (  3, 24   )     ref  ->   x0         "arr expr"
 ;  V106 tmp93       [V106,T02] (  3, 24   )     int  ->   x1         "index expr"
 ;  V107 cse0        [V107,T32] (  4,  4   )  simd16  ->  d16         "CSE #12: moderate"
@@ -141,18 +141,17 @@
 ;  V130 rat10       [V130,T27] (  3,  6   )  simd16  ->  d16         "ReplaceWithLclVar is creating a new local variable"
 ;  V131 rat11       [V131,T28] (  3,  6   )  simd16  ->  d16         "ReplaceWithLclVar is creating a new local variable"
 ;
-; Lcl frame size = 56
+; Lcl frame size = 96
 
 G_M45600_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, nogc <-- Prolog IG
-            stp     fp, lr, [sp, #-0xC0]!
-            stp     d8, d9, [sp, #0x48]
-            stp     d10, d11, [sp, #0x58]
-            stp     d12, d13, [sp, #0x68]
-            stp     x19, x20, [sp, #0x78]
-            stp     x21, x22, [sp, #0x88]
-            stp     x23, x24, [sp, #0x98]
-            stp     x25, x26, [sp, #0xA8]
-            str     x27, [sp, #0xB8]
+            stp     fp, lr, [sp, #-0xD0]!
+            stp     d8, d9, [sp, #0x70]
+            str     d10, [sp, #0x80]
+            stp     x19, x20, [sp, #0x88]
+            stp     x21, x22, [sp, #0x98]
+            stp     x23, x24, [sp, #0xA8]
+            stp     x25, x26, [sp, #0xB8]
+            str     x27, [sp, #0xC8]
             mov     fp, sp
             mov     x19, x0
             ; byrRegs +[x19]
@@ -162,7 +161,7 @@ G_M45600_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             ; gcrRegs +[x22]
             mov     x20, x5
             ; byrRegs +[x20]
-						;; size=56 bbWeight=1 PerfScore 11.50
+						;; size=52 bbWeight=1 PerfScore 10.50
 G_M45600_IG02:        ; bbWeight=1, gcrefRegs=400000 {x22}, byrefRegs=380006 {x1 x2 x19 x20 x21}, byref, isz
             ; byrRegs +[x1-x2]
             ldp     s16, s17, [x1]
@@ -232,10 +231,10 @@ G_M45600_IG02:        ; bbWeight=1, gcrefRegs=400000 {x22}, byrefRegs=380006 {x1
             faddp   v16.4s, v16.4s, v16.4s
             ins     v18.s[2], v16.s[0]
             ins     v18.s[3], v25.s[0]
-            str     d18, [fp, #0x28]	// [V07 loc1]
+            str     d18, [fp, #0x50]	// [V07 loc1]
             mov     w0, v18.s[2]
             ; byrRegs -[x0]
-            str     w0, [fp, #0x30]	// [V07 loc1+0x08]
+            str     w0, [fp, #0x58]	// [V07 loc1+0x08]
             add     x0, x2, #16
             ; byrRegs +[x0]
             ldr     d16, [x0]
@@ -254,11 +253,11 @@ G_M45600_IG02:        ; bbWeight=1, gcrefRegs=400000 {x22}, byrefRegs=380006 {x1
             faddp   v16.4s, v16.4s, v16.4s
             ins     v17.s[2], v16.s[0]
             ins     v17.s[3], v25.s[0]
-            str     d17, [fp, #0x38]	// [V07 loc1+0x10]
+            str     d17, [fp, #0x60]	// [V07 loc1+0x10]
             mov     w0, v17.s[2]
             ; byrRegs -[x0]
-            str     w0, [fp, #0x40]	// [V07 loc1+0x18]
-            str     wzr, [fp, #0x34]	// [V07 loc1+0x0c]
+            str     w0, [fp, #0x68]	// [V07 loc1+0x18]
+            str     wzr, [fp, #0x5C]	// [V07 loc1+0x0c]
             mov     w23, wzr
             ldr     w0, [x19, #0x08]
             cmp     w0, #0
@@ -278,9 +277,9 @@ G_M45600_IG04:        ; bbWeight=4, gcrefRegs=400000 {x22}, byrefRegs=380000 {x1
             mov     w1, #36
             smaddl  x26, w23, w1, x0
             fmov    s16, #-1.0000
-            str     s16, [fp, #0x18]	// [V11 loc5]
-            str     xzr, [fp, #0x1C]	// [V11 loc5+0x04]
-            str     wzr, [fp, #0x24]	// [V11 loc5+0x0c]
+            str     s16, [fp, #0x40]	// [V11 loc5]
+            str     xzr, [fp, #0x44]	// [V11 loc5+0x04]
+            str     wzr, [fp, #0x4C]	// [V11 loc5+0x0c]
             ldr     x0, [x22, #0x08]
             ; gcrRegs +[x0]
             ldr     w1, [x26]
@@ -299,9 +298,9 @@ G_M45600_IG04:        ; bbWeight=4, gcrefRegs=400000 {x22}, byrefRegs=380000 {x1
             ; byrRegs -[x0]
             mov     x1, x24
             mov     x2, x25
-            mov     v11.d[0], v8.d[1]
-            mov     v12.d[0], v9.d[1]
-            mov     v13.d[0], v10.d[1]
+            str     z8, [fp, #3, mul vl]	// [V102 tmp89]
+            str     z9, [fp, #2, mul vl]	// [V103 tmp90]
+            str     z10, [fp, #1, mul vl]	// [V104 tmp91]
             bl      CORINFO_HELP_VIRTUAL_FUNC_PTR
             ; gcrRegs -[x0]
             ; byrRegs +[x26]
@@ -310,8 +309,8 @@ G_M45600_IG04:        ; bbWeight=4, gcrefRegs=400000 {x22}, byrefRegs=380000 {x1
             and     w1, w1, #0xD1FFAB1E
             add     x2, x26, #4
             ; byrRegs +[x2]
-            add     x5, fp, #24	// [V11 loc5]
-            add     x3, fp, #40	// [V07 loc1]
+            add     x5, fp, #64	// [V11 loc5]
+            add     x3, fp, #80	// [V07 loc1]
             mov     x0, x27
             ; gcrRegs +[x0]
             mov     x4, x21
@@ -319,25 +318,25 @@ G_M45600_IG04:        ; bbWeight=4, gcrefRegs=400000 {x22}, byrefRegs=380000 {x1
             blr     x6
             ; gcrRegs -[x0 x27]
             ; byrRegs -[x2 x4 x26]
-            ldr     s16, [fp, #0x18]	// [V11 loc5]
+            ldr     s16, [fp, #0x40]	// [V11 loc5]
             fcmp    s16, #0.0
-            mov     v8.d[1], v11.d[0]
-            mov     v9.d[1], v12.d[0]
-            mov     v10.d[1], v13.d[0]
+            ldr     z8, [fp, #3, mul vl]	// [V102 tmp89]
+            ldr     z9, [fp, #2, mul vl]	// [V103 tmp90]
+            ldr     z10, [fp, #1, mul vl]	// [V104 tmp91]
             blt     G_M45600_IG06
-						;; size=152 bbWeight=4 PerfScore 194.00
+						;; size=152 bbWeight=4 PerfScore 254.00
 G_M45600_IG05:        ; bbWeight=2, gcrefRegs=400000 {x22}, byrefRegs=380000 {x19 x20 x21}, byref
             mov     v17.16b, v8.16b
-            ldr     s18, [fp, #0x1C]	// [V11 loc5+0x04]
+            ldr     s18, [fp, #0x44]	// [V11 loc5+0x04]
             dup     v18.4s, v18.s[0]
             fmul    v17.4s, v17.4s, v18.4s
             mov     v18.16b, v9.16b
-            ldr     s19, [fp, #0x20]	// [V11 loc5+0x08]
+            ldr     s19, [fp, #0x48]	// [V11 loc5+0x08]
             dup     v19.4s, v19.s[0]
             fmul    v18.4s, v18.4s, v19.4s
             fadd    v17.4s, v17.4s, v18.4s
             mov     v18.16b, v10.16b
-            ldr     s19, [fp, #0x24]	// [V11 loc5+0x0c]
+            ldr     s19, [fp, #0x4C]	// [V11 loc5+0x0c]
             dup     v19.4s, v19.s[0]
             fmul    v18.4s, v18.4s, v19.4s
             fadd    v17.4s, v17.4s, v18.4s
@@ -360,23 +359,22 @@ G_M45600_IG06:        ; bbWeight=4, gcrefRegs=400000 {x22}, byrefRegs=380000 {x1
 G_M45600_IG07:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, epilog, nogc
             ; gcrRegs -[x22]
             ; byrRegs -[x19-x21]
-            ldr     x27, [sp, #0xB8]
-            ldp     x25, x26, [sp, #0xA8]
-            ldp     x23, x24, [sp, #0x98]
-            ldp     x21, x22, [sp, #0x88]
-            ldp     x19, x20, [sp, #0x78]
-            ldp     d12, d13, [sp, #0x68]
-            ldp     d10, d11, [sp, #0x58]
-            ldp     d8, d9, [sp, #0x48]
-            ldp     fp, lr, [sp], #0xC0
+            ldr     x27, [sp, #0xC8]
+            ldp     x25, x26, [sp, #0xB8]
+            ldp     x23, x24, [sp, #0xA8]
+            ldp     x21, x22, [sp, #0x98]
+            ldp     x19, x20, [sp, #0x88]
+            ldr     d10, [sp, #0x80]
+            ldp     d8, d9, [sp, #0x70]
+            ldp     fp, lr, [sp], #0xD0
             ret     lr
-						;; size=40 bbWeight=1 PerfScore 11.00
+						;; size=36 bbWeight=1 PerfScore 11.00
 G_M45600_IG08:        ; bbWeight=0, gcVars=0000000000000000 {}, gcrefRegs=0000 {}, byrefRegs=0000 {}, gcvars, byref
             bl      CORINFO_HELP_RNGCHKFAIL
             brk     #0
 						;; size=8 bbWeight=0 PerfScore 0.00
 
-; Total bytes of code 760, prolog size 40, PerfScore 514.25, instruction count 190, allocated bytes for code 760 (MethodHash=b6c44ddf) for method BepuPhysics.Collidables.Compound:RayTest[DemoBenchmarks.ShapeRayBenchmarksDeep+HitHandler](byref,byref,byref,BepuPhysics.Collidables.Shapes,byref):this (FullOpts)
+; Total bytes of code 752, prolog size 36, PerfScore 573.25, instruction count 188, allocated bytes for code 752 (MethodHash=b6c44ddf) for method BepuPhysics.Collidables.Compound:RayTest[DemoBenchmarks.ShapeRayBenchmarksDeep+HitHandler](byref,byref,byref,BepuPhysics.Collidables.Shapes,byref):this (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -387,7 +385,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 190 (0x000be) Actual length = 760 (0x0002f8)
+  Function Length   : 188 (0x000bc) Actual length = 752 (0x0002f0)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
@@ -395,15 +393,14 @@ Unwind Info:
   ---- Unwind codes ----
     E1          set_fp; mov fp, sp
     ---- Epilog start at index 1 ----
-    D2 17       save_reg X#8 Z#23 (0x17); str x27, [sp, #184]
+    D2 19       save_reg X#8 Z#25 (0x19); str x27, [sp, #200]
     E6          save_next
     E6          save_next
     E6          save_next
-    C8 0F       save_regp X#0 Z#15 (0x0F); stp x19, x20, [sp, #120]
-    E6          save_next
-    E6          save_next
-    D8 09       save_fregp X#0 Z#9 (0x09); stp d8, d9, [sp, #72]
-    97          save_fplr_x #23 (0x17); stp fp, lr, [sp, #-192]!
+    C8 11       save_regp X#0 Z#17 (0x11); stp x19, x20, [sp, #136]
+    DC 90       save_freg X#2 Z#16 (0x10); str d10, [sp, #128]
+    D8 0E       save_fregp X#0 Z#14 (0x0E); stp d8, d9, [sp, #112]
+    99          save_fplr_x #25 (0x19); stp fp, lr, [sp, #-208]!
     E4          end
     E4          end
     E4          end
+8 (+33.33%) : 1686.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:get_Capsule():BepuPhysics.Collidables.CapsuleWide:this (FullOpts)
@@ -13,8 +13,8 @@
 ;# V02 OutArgs      [V02    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;* V03 tmp1         [V03    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[float]>
 ;* V04 tmp2         [V04    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[float]>
-;* V05 tmp3         [V05,T00] (  0,  0   )  simd16  ->  zero-ref    "field V01.Radius (fldOffset=0x0)" P-INDEP
-;* V06 tmp4         [V06,T01] (  0,  0   )  simd16  ->  zero-ref    "field V01.HalfLength (fldOffset=0x10)" P-INDEP
+;  V05 tmp3         [V05,T00] (  2,  2   )  simd16  ->   d0         "field V01.Radius (fldOffset=0x0)" P-INDEP
+;  V06 tmp4         [V06,T01] (  2,  2   )  simd16  ->   d1         "field V01.HalfLength (fldOffset=0x10)" P-INDEP
 ;
 ; Lcl frame size = 0
 
@@ -22,18 +22,18 @@ G_M13019_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
             stp     fp, lr, [sp, #-0x10]!
             mov     fp, sp
 						;; size=8 bbWeight=1 PerfScore 1.50
-G_M13019_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref, isz
-            movi    v0.4s, #0x3F,  LSL #24
-            ldr     q1, [@RWD00]
-						;; size=8 bbWeight=1 PerfScore 2.50
+G_M13019_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref
+            fmov    s1, #0.5000
+            mov     z0.s, s1
+            fmov    s1, #1.0000
+            mov     z1.s, s1
+						;; size=16 bbWeight=1 PerfScore 5.00
 G_M13019_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
-RWD00  	dq	3F8000003F800000h, 3F8000003F800000h
 
-
-; Total bytes of code 24, prolog size 8, PerfScore 6.00, instruction count 6, allocated bytes for code 24 (MethodHash=ca32cd24) for method DemoBenchmarks.ConvexCollisionTesterBenchmarks:get_Capsule():BepuPhysics.Collidables.CapsuleWide:this (FullOpts)
+; Total bytes of code 32, prolog size 8, PerfScore 8.50, instruction count 8, allocated bytes for code 32 (MethodHash=ca32cd24) for method DemoBenchmarks.ConvexCollisionTesterBenchmarks:get_Capsule():BepuPhysics.Collidables.CapsuleWide:this (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -44,7 +44,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 6 (0x00006) Actual length = 24 (0x000018)
+  Function Length   : 8 (0x00008) Actual length = 32 (0x000020)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
+24 (+31.58%) : 955.dasm - BepuUtilities.Vector3Wide:ConditionalSelect(byref,byref,byref,byref) (FullOpts)
@@ -22,27 +22,33 @@ G_M12356_IG01:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=0000 {}, byref,
 G_M12356_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=000F {x0 x1 x2 x3}, byref
             ; byrRegs +[x0-x3]
             ldr     q16, [x0]
-            ldr     q17, [x1]
-            ldr     q18, [x2]
-            bsl     v16.4s, v17.4s, v18.4s
+            ptrue   p0.s
+            cmpne   p0.s, p0/z, z16.s, #0
+            ldr     q16, [x1]
+            ldr     q17, [x2]
+            sel     z16.s, p0, z16.s, z17.s
             str     q16, [x3]
             ldr     q16, [x0]
-            ldr     q17, [x1, #0x10]
-            ldr     q18, [x2, #0x10]
-            bsl     v16.4s, v17.4s, v18.4s
+            ptrue   p0.s
+            cmpne   p0.s, p0/z, z16.s, #0
+            ldr     q16, [x1, #0x10]
+            ldr     q17, [x2, #0x10]
+            sel     z16.s, p0, z16.s, z17.s
             str     q16, [x3, #0x10]
             ldr     q16, [x0]
-            ldr     q17, [x1, #0x20]
-            ldr     q18, [x2, #0x20]
-            bsl     v16.4s, v17.4s, v18.4s
+            ptrue   p0.s
+            cmpne   p0.s, p0/z, z16.s, #0
+            ldr     q16, [x1, #0x20]
+            ldr     q17, [x2, #0x20]
+            sel     z16.s, p0, z16.s, z17.s
             str     q16, [x3, #0x20]
-						;; size=60 bbWeight=1 PerfScore 31.50
+						;; size=84 bbWeight=1 PerfScore 51.00
 G_M12356_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
 
-; Total bytes of code 76, prolog size 8, PerfScore 35.00, instruction count 19, allocated bytes for code 76 (MethodHash=619ecfbb) for method BepuUtilities.Vector3Wide:ConditionalSelect(byref,byref,byref,byref) (FullOpts)
+; Total bytes of code 100, prolog size 8, PerfScore 54.50, instruction count 25, allocated bytes for code 100 (MethodHash=619ecfbb) for method BepuUtilities.Vector3Wide:ConditionalSelect(byref,byref,byref,byref) (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -53,7 +59,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 19 (0x00013) Actual length = 76 (0x00004c)
+  Function Length   : 25 (0x00019) Actual length = 100 (0x000064)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
+44 (+20.00%) : 1662.dasm - BepuPhysics.Constraints.Contact.FrictionHelpers:ComputeFrictionCenter(byref,byref,byref,byref,byref,byref,byref) (FullOpts)
@@ -11,35 +11,36 @@
 ;  V00 arg0         [V00,T00] (  5,  5   )   byref  ->   x0         single-def
 ;  V01 arg1         [V01,T01] (  5,  5   )   byref  ->   x1         single-def
 ;  V02 arg2         [V02,T02] (  5,  5   )   byref  ->   x2         single-def
-;  V03 arg3         [V03,T04] (  3,  3   )   byref  ->   x3         single-def
-;  V04 arg4         [V04,T05] (  3,  3   )   byref  ->   x4         single-def
-;  V05 arg5         [V05,T06] (  3,  3   )   byref  ->   x5         single-def
+;  V03 arg3         [V03,T05] (  3,  3   )   byref  ->   x3         single-def
+;  V04 arg4         [V04,T06] (  3,  3   )   byref  ->   x4         single-def
+;  V05 arg5         [V05,T07] (  3,  3   )   byref  ->   x5         single-def
 ;  V06 arg6         [V06,T03] (  5,  5   )   byref  ->   x6         single-def
-;  V07 loc0         [V07,T07] (  7,  7   )  simd16  ->  d16         ld-addr-op <System.Numerics.Vector`1[float]>
-;  V08 loc1         [V08,T08] (  7,  7   )  simd16  ->  d17         ld-addr-op <System.Numerics.Vector`1[float]>
-;  V09 loc2         [V09,T09] (  7,  7   )  simd16  ->  d19         ld-addr-op <System.Numerics.Vector`1[float]>
+;  V07 loc0         [V07,T08] (  7,  7   )  simd16  ->  d16         ld-addr-op <System.Numerics.Vector`1[float]>
+;  V08 loc1         [V08,T09] (  7,  7   )  simd16  ->  d18         ld-addr-op <System.Numerics.Vector`1[float]>
+;  V09 loc2         [V09,T10] (  7,  7   )  simd16  ->  d19         ld-addr-op <System.Numerics.Vector`1[float]>
 ;  V10 loc3         [V10,T13] (  3,  3   )  simd16  ->  d20         <System.Numerics.Vector`1[float]>
-;  V11 loc4         [V11,T11] (  5,  5   )  simd16  ->  d21         <System.Numerics.Vector`1[int]>
-;  V12 loc5         [V12,T10] (  7,  7   )  simd16  ->  d18         <System.Numerics.Vector`1[float]>
+;  V11 loc4         [V11,T14] (  2,  2   )  simd16  ->  d21         <System.Numerics.Vector`1[int]>
+;  V12 loc5         [V12,T11] (  7,  7   )  simd16  ->  d17         <System.Numerics.Vector`1[float]>
 ;* V13 loc6         [V13    ] (  0,  0   )  struct (48) zero-ref    ld-addr-op <BepuUtilities.Vector3Wide>
 ;* V14 loc7         [V14    ] (  0,  0   )  struct (48) zero-ref    ld-addr-op <BepuUtilities.Vector3Wide>
 ;* V15 loc8         [V15    ] (  0,  0   )  struct (48) zero-ref    ld-addr-op <BepuUtilities.Vector3Wide>
 ;* V16 loc9         [V16    ] (  0,  0   )  struct (48) zero-ref    ld-addr-op <BepuUtilities.Vector3Wide>
 ;# V17 OutArgs      [V17    ] (  1,  1   )  struct ( 0) [sp+0x00]   do-not-enreg[XS] addr-exposed "OutgoingArgSpace" <Empty>
 ;* V18 tmp1         [V18    ] (  0,  0   )  simd16  ->  zero-ref    ld-addr-op "NewObj constructor temp" <System.Numerics.Vector`1[float]>
-;  V19 tmp2         [V19,T14] (  2,  2   )  simd16  ->  d18         "field V13.X (fldOffset=0x0)" P-INDEP
-;  V20 tmp3         [V20,T15] (  2,  2   )  simd16  ->  d20         "field V13.Y (fldOffset=0x10)" P-INDEP
-;  V21 tmp4         [V21,T16] (  2,  2   )  simd16  ->  d16         "field V13.Z (fldOffset=0x20)" P-INDEP
-;  V22 tmp5         [V22,T17] (  2,  2   )  simd16  ->  d21         "field V14.X (fldOffset=0x0)" P-INDEP
-;  V23 tmp6         [V23,T18] (  2,  2   )  simd16  ->  d22         "field V14.Y (fldOffset=0x10)" P-INDEP
-;  V24 tmp7         [V24,T19] (  2,  2   )  simd16  ->  d17         "field V14.Z (fldOffset=0x20)" P-INDEP
-;  V25 tmp8         [V25,T20] (  2,  2   )  simd16  ->  d23         "field V15.X (fldOffset=0x0)" P-INDEP
-;  V26 tmp9         [V26,T21] (  2,  2   )  simd16  ->  d24         "field V15.Y (fldOffset=0x10)" P-INDEP
-;  V27 tmp10        [V27,T22] (  2,  2   )  simd16  ->  d19         "field V15.Z (fldOffset=0x20)" P-INDEP
-;  V28 tmp11        [V28,T23] (  2,  2   )  simd16  ->  d18         "field V16.X (fldOffset=0x0)" P-INDEP
-;  V29 tmp12        [V29,T24] (  2,  2   )  simd16  ->  d20         "field V16.Y (fldOffset=0x10)" P-INDEP
-;  V30 tmp13        [V30,T25] (  2,  2   )  simd16  ->  d16         "field V16.Z (fldOffset=0x20)" P-INDEP
-;  V31 cse0         [V31,T12] (  5,  5   )  simd16  ->  d18         "CSE #01: aggressive"
+;  V19 tmp2         [V19,T15] (  2,  2   )  simd16  ->  d17         "field V13.X (fldOffset=0x0)" P-INDEP
+;  V20 tmp3         [V20,T16] (  2,  2   )  simd16  ->  d20         "field V13.Y (fldOffset=0x10)" P-INDEP
+;  V21 tmp4         [V21,T17] (  2,  2   )  simd16  ->  d16         "field V13.Z (fldOffset=0x20)" P-INDEP
+;  V22 tmp5         [V22,T18] (  2,  2   )  simd16  ->  d21         "field V14.X (fldOffset=0x0)" P-INDEP
+;  V23 tmp6         [V23,T19] (  2,  2   )  simd16  ->  d22         "field V14.Y (fldOffset=0x10)" P-INDEP
+;  V24 tmp7         [V24,T20] (  2,  2   )  simd16  ->  d18         "field V14.Z (fldOffset=0x20)" P-INDEP
+;  V25 tmp8         [V25,T21] (  2,  2   )  simd16  ->  d23         "field V15.X (fldOffset=0x0)" P-INDEP
+;  V26 tmp9         [V26,T22] (  2,  2   )  simd16  ->  d24         "field V15.Y (fldOffset=0x10)" P-INDEP
+;  V27 tmp10        [V27,T23] (  2,  2   )  simd16  ->  d19         "field V15.Z (fldOffset=0x20)" P-INDEP
+;  V28 tmp11        [V28,T24] (  2,  2   )  simd16  ->  d17         "field V16.X (fldOffset=0x0)" P-INDEP
+;  V29 tmp12        [V29,T25] (  2,  2   )  simd16  ->  d20         "field V16.Y (fldOffset=0x10)" P-INDEP
+;  V30 tmp13        [V30,T26] (  2,  2   )  simd16  ->  d16         "field V16.Z (fldOffset=0x20)" P-INDEP
+;  V31 cse0         [V31,T12] (  5,  5   )  simd16  ->  d17         "CSE #01: aggressive"
+;  V32 cse1         [V32,T04] (  5,  5   )    mask  ->   p0         "CSE #02: aggressive"
 ;
 ; Lcl frame size = 0
 
@@ -51,65 +52,75 @@ G_M6513_IG02:        ; bbWeight=1, gcrefRegs=0000 {}, byrefRegs=007F {x0 x1 x2 x
             ; byrRegs +[x0-x6]
             ldr     q16, [x3]
             movi    v17.4s, #0
-            cmgt    v16.4s, v17.4s, v16.4s
-            movi    v17.4s, #0
-            ldr     q18, [@RWD00]
-            bsl     v16.4s, v17.4s, v18.4s
-            ldr     q17, [x4]
+            ptrue   p0.s
+            cmpgt   p0.s, p0/z, z17.s, z16.s
+            movi    v16.4s, #0
+            ldr     q17, [@RWD00]
+            sel     z16.s, p0, z16.s, z17.s
+            ldr     q18, [x4]
             movi    v19.4s, #0
-            cmgt    v17.4s, v19.4s, v17.4s
-            movi    v19.4s, #0
-            bsl     v17.4s, v19.4s, v18.4s
+            ptrue   p0.s
+            cmpgt   p0.s, p0/z, z19.s, z18.s
+            movi    v18.4s, #0
+            sel     z18.s, p0, z18.s, z17.s
             ldr     q19, [x5]
             movi    v20.4s, #0
-            cmgt    v19.4s, v20.4s, v19.4s
-            movi    v20.4s, #0
-            bsl     v19.4s, v20.4s, v18.4s
-            fadd    v20.4s, v16.4s, v17.4s
-            fadd    v20.4s, v20.4s, v19.4s
-            cmeq    v21.4s, v20.4s, #0
-            ldr     q22, [@RWD16]
-            bit     v20.4s, v22.4s, v21.4s
-            fdiv    v18.4s, v18.4s, v20.4s
-            fmul    v16.4s, v16.4s, v18.4s
-            bit     v16.4s, v18.4s, v21.4s
-            fmul    v17.4s, v17.4s, v18.4s
-            bit     v17.4s, v18.4s, v21.4s
-            fmul    v19.4s, v19.4s, v18.4s
-            bit     v19.4s, v18.4s, v21.4s
-            ldp     q18, q20, [x0]
-            fmul    v18.4s, v18.4s, v16.4s
-            fmul    v20.4s, v20.4s, v16.4s
+            ptrue   p0.s
+            cmpgt   p0.s, p0/z, z20.s, z19.s
+            movi    v19.4s, #0
+            sel     z19.s, p0, z19.s, z17.s
+            fadd    z20.s, z16.s, z18.s
+            fadd    z20.s, z20.s, z19.s
+            movi    v21.4s, #0
+            ptrue   p0.s
+            cmpeq   p0.s, p0/z, z20.s, z21.s
+            mov     z21.s, p0/z, #1
+            ptrue   p0.s
+            cmpne   p0.s, p0/z, z21.s, #0
+            fmov    s21, #3.0000
+            mov     z21.s, s21
+            mov     z20.s, p0/m, z21.s
+            ptrue   p1.s
+            movprfx z17.s, p1/z, z17.s
+            fdiv    z17.s, p1/m, z17.s, z20.s
+            fmul    z16.s, z16.s, z17.s
+            mov     z16.s, p0/m, z17.s
+            fmul    z18.s, z18.s, z17.s
+            mov     z18.s, p0/m, z17.s
+            fmul    z19.s, z19.s, z17.s
+            mov     z19.s, p0/m, z17.s
+            ldp     q17, q20, [x0]
+            fmul    z17.s, z17.s, z16.s
+            fmul    z20.s, z20.s, z16.s
             ldr     q21, [x0, #0x20]
-            fmul    v16.4s, v21.4s, v16.4s
+            fmul    z16.s, z21.s, z16.s
             ldp     q21, q22, [x1]
-            fmul    v21.4s, v21.4s, v17.4s
-            fmul    v22.4s, v22.4s, v17.4s
+            fmul    z21.s, z21.s, z18.s
+            fmul    z22.s, z22.s, z18.s
             ldr     q23, [x1, #0x20]
-            fmul    v17.4s, v23.4s, v17.4s
+            fmul    z18.s, z23.s, z18.s
             ldp     q23, q24, [x2]
-            fmul    v23.4s, v23.4s, v19.4s
-            fmul    v24.4s, v24.4s, v19.4s
+            fmul    z23.s, z23.s, z19.s
+            fmul    z24.s, z24.s, z19.s
             ldr     q25, [x2, #0x20]
-            fmul    v19.4s, v25.4s, v19.4s
-            fadd    v18.4s, v18.4s, v21.4s
-            fadd    v20.4s, v20.4s, v22.4s
-            fadd    v16.4s, v16.4s, v17.4s
-            fadd    v17.4s, v18.4s, v23.4s
-            fadd    v18.4s, v20.4s, v24.4s
+            fmul    z19.s, z25.s, z19.s
+            fadd    z17.s, z17.s, z21.s
+            fadd    z20.s, z20.s, z22.s
+            fadd    z16.s, z16.s, z18.s
+            fadd    z17.s, z17.s, z23.s
+            fadd    z18.s, z20.s, z24.s
             stp     q17, q18, [x6]
-            fadd    v16.4s, v16.4s, v19.4s
+            fadd    z16.s, z16.s, z19.s
             str     q16, [x6, #0x20]
-						;; size=204 bbWeight=1 PerfScore 115.50
+						;; size=248 bbWeight=1 PerfScore 136.00
 G_M6513_IG03:        ; bbWeight=1, epilog, nogc, extend
             ldp     fp, lr, [sp], #0x10
             ret     lr
 						;; size=8 bbWeight=1 PerfScore 2.00
 RWD00  	dq	3F8000003F800000h, 3F8000003F800000h
-RWD16  	dq	4040000040400000h, 4040000040400000h
 
 
-; Total bytes of code 220, prolog size 8, PerfScore 119.00, instruction count 55, allocated bytes for code 220 (MethodHash=655ce68e) for method BepuPhysics.Constraints.Contact.FrictionHelpers:ComputeFrictionCenter(byref,byref,byref,byref,byref,byref,byref) (FullOpts)
+; Total bytes of code 264, prolog size 8, PerfScore 139.50, instruction count 66, allocated bytes for code 264 (MethodHash=655ce68e) for method BepuPhysics.Constraints.Contact.FrictionHelpers:ComputeFrictionCenter(byref,byref,byref,byref,byref,byref,byref) (FullOpts)
 ; ============================================================
 
 Unwind Info:
@@ -120,7 +131,7 @@ Unwind Info:
   E bit             : 0
   X bit             : 0
   Vers              : 0
-  Function Length   : 55 (0x00037) Actual length = 220 (0x0000dc)
+  Function Length   : 66 (0x00042) Actual length = 264 (0x000108)
   ---- Epilog scopes ----
   ---- Scope 0
   Epilog Start Offset        : 3523193630 (0xd1ffab1e) Actual offset = 3523193630 (0xd1ffab1e) Offset from main function begin = 3523193630 (0xd1ffab1e)
Details

Size improvements/regressions per collection

Collection Contexts with diffs Improvements Regressions Same size Improvements (bytes) Regressions (bytes)
realworld.run.windows.arm64.checked.mch 150 10 74 66 -324 +2,292

PerfScore improvements/regressions per collection

Collection Contexts with diffs Improvements Regressions Same PerfScore Improvements (PerfScore) Regressions (PerfScore) PerfScore Overall in FullOpts
realworld.run.windows.arm64.checked.mch 150 58 85 7 -14.31% +10.66% -0.0012%

Context information

Collection Diffed contexts MinOpts FullOpts Missed, base Missed, diff
realworld.run.windows.arm64.checked.mch 29,064 3 29,061 0 (0.00%) 102 (0.35%)

jit-analyze output

realworld.run.windows.arm64.checked.mch

Summary of Code Size diffs:
(Lower is better)

Total bytes of base: 14279912 (overridden on cmd)
Total bytes of diff: 14281880 (overridden on cmd)
Total bytes of delta: 1968 (0.01 % of base)
    diff is a regression.
    relative diff is a regression.
Detail diffs


Top file regressions (bytes):
         220 : 953.dasm (20.45% of base)
         196 : 1005.dasm (23.90% of base)
         160 : 2974.dasm (8.40% of base)
         136 : 2526.dasm (3.59% of base)
          80 : 1632.dasm (0.67% of base)
          72 : 1007.dasm (22.50% of base)
          68 : 1216.dasm (19.10% of base)
          64 : 1235.dasm (6.11% of base)
          64 : 1709.dasm (7.51% of base)
          60 : 1716.dasm (9.62% of base)
          56 : 986.dasm (28.57% of base)
          56 : 1710.dasm (10.14% of base)
          48 : 1696.dasm (4.58% of base)
          48 : 980.dasm (6.78% of base)
          48 : 1666.dasm (16.67% of base)
          44 : 1662.dasm (20.00% of base)
          36 : 1190.dasm (4.37% of base)
          36 : 1676.dasm (2.46% of base)
          36 : 1669.dasm (2.93% of base)
          32 : 1353.dasm (2.02% of base)

Top file improvements (bytes):
         -84 : 767.dasm (-3.54% of base)
         -84 : 1170.dasm (-3.54% of base)
         -44 : 1683.dasm (-1.90% of base)
         -36 : 1645.dasm (-0.35% of base)
         -24 : 1643.dasm (-0.24% of base)
         -16 : 1242.dasm (-1.40% of base)
         -12 : 1630.dasm (-0.64% of base)
          -8 : 1183.dasm (-0.53% of base)
          -8 : 1624.dasm (-1.05% of base)
          -8 : 1254.dasm (-0.73% of base)

40 total files with Code Size differences (10 improved, 30 regressed), 28 unchanged.

Top method regressions (bytes):
         220 (20.45% of base) : 953.dasm - BepuPhysics.CollisionDetection.CollisionTasks.ManifoldCandidateHelper:Reduce(byref,System.Numerics.Vector`1[int],int,byref,System.Numerics.Vector`1[float],byref,byref,byref,System.Numerics.Vector`1[float],System.Numerics.Vector`1[float],int,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
         196 (23.90% of base) : 1005.dasm - BepuPhysics.CollisionDetection.CollisionTasks.ManifoldCandidateHelper:ReduceWithoutComputingDepths(byref,System.Numerics.Vector`1[int],int,System.Numerics.Vector`1[float],System.Numerics.Vector`1[float],int,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
         160 ( 8.40% of base) : 2974.dasm - SixLabors.ImageSharp.Processing.Processors.Convolution.KernelSamplingMap:BuildSamplingOffsetMap(int,int,SixLabors.ImageSharp.Rectangle):this (FullOpts)
         136 ( 3.59% of base) : 2526.dasm - SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.JpegComponentPostProcessor:CopyBlocksToColorBuffer(int):this (FullOpts)
          80 ( 0.67% of base) : 1632.dasm - DemoBenchmarks.RagdollTubeBenchmark:AddRagdoll(System.Numerics.Vector3,System.Numerics.Quaternion,int,BepuPhysics.CollidableProperty`1[DemoBenchmarks.RagdollTubeBenchmark+SubgroupCollisionFilter],BepuPhysics.Simulation):DemoBenchmarks.RagdollTubeBenchmark+RagdollHandles (FullOpts)
          72 (22.50% of base) : 1007.dasm - BepuPhysics.CollisionDetection.CollisionTasks.BoxPairTester:ClipBoxBEdgeAgainstBoxAFace(byref,byref,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          68 (19.10% of base) : 1216.dasm - BepuUtilities.MathHelper:Acos(System.Numerics.Vector`1[float]):System.Numerics.Vector`1[float] (FullOpts)
          64 ( 6.11% of base) : 1235.dasm - BepuPhysics.Constraints.AreaConstraintFunctions:Solve(byref,byref,byref,byref,byref,byref,byref,byref,byref,float,float,byref,byref,byref,byref,byref):this (FullOpts)
          64 ( 7.51% of base) : 1709.dasm - BepuPhysics.Constraints.VolumeConstraintFunctions:Solve(byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,byref,float,float,byref,byref,byref,byref,byref,byref):this (FullOpts)
          60 ( 9.62% of base) : 1716.dasm - BepuPhysics.Constraints.CenterDistanceConstraintFunctions:Solve(byref,byref,byref,byref,byref,byref,float,float,byref,byref,byref,byref):this (FullOpts)
          56 (28.57% of base) : 986.dasm - BepuPhysics.CollisionDetection.CollisionTasks.BoxTriangleTester:ClipTriangleEdgeAgainstPlanes(byref,byref,byref,byref,byref,byref) (FullOpts)
          56 (10.14% of base) : 1710.dasm - DemoBenchmarks.GroupedCollisionTesterBenchmarks:CheapCollisionBenchmarks():System.Numerics.Vector`1[float]:this (FullOpts)
          48 ( 6.78% of base) : 980.dasm - BepuPhysics.CollisionDetection.CollisionTasks.BoxTriangleTester:TestBoxEdgesAgainstTriangleEdge(byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          48 (16.67% of base) : 1666.dasm - BepuPhysics.Constraints.Contact.FrictionHelpers:ComputeFrictionCenter(byref,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          48 ( 4.58% of base) : 1696.dasm - BepuUtilities.Symmetric6x6Wide:LDLTSolve(byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          44 (20.00% of base) : 1662.dasm - BepuPhysics.Constraints.Contact.FrictionHelpers:ComputeFrictionCenter(byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          36 ( 2.93% of base) : 1669.dasm - BepuPhysics.Constraints.AngularMotorFunctions:Solve(byref,byref,byref,byref,byref,byref,float,float,byref,byref,byref,byref):this (FullOpts)
          36 ( 2.46% of base) : 1676.dasm - BepuPhysics.Constraints.OneBodyLinearMotorFunctions:Solve(byref,byref,byref,float,float,byref,byref,byref):this (FullOpts)
          36 ( 4.37% of base) : 1190.dasm - BepuPhysics.PoseIntegration:Integrate(byref,byref,byref,byref) (FullOpts)
          32 ( 2.02% of base) : 1353.dasm - BepuPhysics.Trees.Tree:Add(byref,BepuUtilities.Memory.BufferPool):int:this (FullOpts)

Top method improvements (bytes):
         -84 (-3.54% of base) : 767.dasm - DemoBenchmarks.CollisionBatcherTaskBenchmarks:Setup():this (FullOpts)
         -84 (-3.54% of base) : 1170.dasm - DemoBenchmarks.Sweeper:.ctor():this (FullOpts)
         -44 (-1.90% of base) : 1683.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:Setup():this (FullOpts)
         -36 (-0.35% of base) : 1645.dasm - DemoBenchmarks.RagdollTubeBenchmark:AddLeg(System.Numerics.Vector3,BepuPhysics.RigidPose,BepuPhysics.BodyHandle,byref,int,int,BepuPhysics.RigidPose,BepuPhysics.CollidableProperty`1[DemoBenchmarks.RagdollTubeBenchmark+SubgroupCollisionFilter],BepuPhysics.Constraints.SpringSettings,BepuPhysics.Simulation):DemoBenchmarks.RagdollTubeBenchmark+RagdollLegHandles (FullOpts)
         -24 (-0.24% of base) : 1643.dasm - DemoBenchmarks.RagdollTubeBenchmark:AddArm(float,System.Numerics.Vector3,BepuPhysics.RigidPose,BepuPhysics.BodyHandle,byref,int,int,BepuPhysics.RigidPose,BepuPhysics.CollidableProperty`1[DemoBenchmarks.RagdollTubeBenchmark+SubgroupCollisionFilter],BepuPhysics.Constraints.SpringSettings,BepuPhysics.Simulation):DemoBenchmarks.RagdollTubeBenchmark+RagdollArmHandles (FullOpts)
         -16 (-1.40% of base) : 1242.dasm - BepuPhysics.Collidables.Compound:FindLocalOverlaps[BepuPhysics.CollisionDetection.CollisionTasks.ChildOverlapsCollection](byref,byref,byref,float,BepuUtilities.Memory.BufferPool,BepuPhysics.Collidables.Shapes,ulong):this (FullOpts)
         -12 (-0.64% of base) : 1630.dasm - DemoBenchmarks.RagdollTubeBenchmark:Initialize():this (FullOpts)
          -8 (-0.53% of base) : 1183.dasm - BepuPhysics.BoundingBoxHelpers:GetLocalBoundingBoxForSweep(BepuPhysics.Collidables.TypedIndex,BepuPhysics.Collidables.Shapes,byref,byref,byref,byref,byref,byref,float,byref,byref,byref) (FullOpts)
          -8 (-1.05% of base) : 1624.dasm - BepuPhysics.Collidables.Compound:RayTest[DemoBenchmarks.ShapeRayBenchmarksDeep+HitHandler](byref,byref,byref,BepuPhysics.Collidables.Shapes,byref):this (FullOpts)
          -8 (-0.73% of base) : 1254.dasm - DemoBenchmarks.ShapeRayBenchmarksDeep:Setup():this (FullOpts)

Top method regressions (percentages):
           8 (33.33% of base) : 1686.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:get_Capsule():BepuPhysics.Collidables.CapsuleWide:this (FullOpts)
          24 (31.58% of base) : 955.dasm - BepuUtilities.Vector3Wide:ConditionalSelect(byref,byref,byref,byref) (FullOpts)
          56 (28.57% of base) : 986.dasm - BepuPhysics.CollisionDetection.CollisionTasks.BoxTriangleTester:ClipTriangleEdgeAgainstPlanes(byref,byref,byref,byref,byref,byref) (FullOpts)
         196 (23.90% of base) : 1005.dasm - BepuPhysics.CollisionDetection.CollisionTasks.ManifoldCandidateHelper:ReduceWithoutComputingDepths(byref,System.Numerics.Vector`1[int],int,System.Numerics.Vector`1[float],System.Numerics.Vector`1[float],int,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          72 (22.50% of base) : 1007.dasm - BepuPhysics.CollisionDetection.CollisionTasks.BoxPairTester:ClipBoxBEdgeAgainstBoxAFace(byref,byref,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
         220 (20.45% of base) : 953.dasm - BepuPhysics.CollisionDetection.CollisionTasks.ManifoldCandidateHelper:Reduce(byref,System.Numerics.Vector`1[int],int,byref,System.Numerics.Vector`1[float],byref,byref,byref,System.Numerics.Vector`1[float],System.Numerics.Vector`1[float],int,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          44 (20.00% of base) : 1662.dasm - BepuPhysics.Constraints.Contact.FrictionHelpers:ComputeFrictionCenter(byref,byref,byref,byref,byref,byref,byref) (FullOpts)
          12 (20.00% of base) : 1690.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:get_Triangle():BepuPhysics.Collidables.TriangleWide:this (FullOpts)
           4 (20.00% of base) : 954.dasm - System.Numerics.Vector:OnesComplement[int](System.Numerics.Vector`1[int]):System.Numerics.Vector`1[int] (FullOpts)
          68 (19.10% of base) : 1216.dasm - BepuUtilities.MathHelper:Acos(System.Numerics.Vector`1[float]):System.Numerics.Vector`1[float] (FullOpts)
          48 (16.67% of base) : 1666.dasm - BepuPhysics.Constraints.Contact.FrictionHelpers:ComputeFrictionCenter(byref,byref,byref,byref,byref,byref,byref,byref,byref) (FullOpts)
           4 (16.67% of base) : 1691.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:get_Cylinder():BepuPhysics.Collidables.CylinderWide:this (FullOpts)
           4 (14.29% of base) : 1688.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:get_Box():BepuPhysics.Collidables.BoxWide:this (FullOpts)
          28 (11.67% of base) : 1704.dasm - DemoBenchmarks.GatherScatterBenchmarks:ScatterState():this (FullOpts)
           4 (11.11% of base) : 997.dasm - System.Numerics.Vector:EqualsAll[int](System.Numerics.Vector`1[int],System.Numerics.Vector`1[int]):ubyte (FullOpts)
          56 (10.14% of base) : 1710.dasm - DemoBenchmarks.GroupedCollisionTesterBenchmarks:CheapCollisionBenchmarks():System.Numerics.Vector`1[float]:this (FullOpts)
          60 ( 9.62% of base) : 1716.dasm - BepuPhysics.Constraints.CenterDistanceConstraintFunctions:Solve(byref,byref,byref,byref,byref,byref,float,float,byref,byref,byref,byref):this (FullOpts)
          24 ( 9.23% of base) : 2763.dasm - SixLabors.ImageSharp.Formats.Jpeg.Components.Decoder.ColorConverters.JpegColorConverterBase+FromYCbCrVector:ConvertCoreVectorizedInplace(byref):this (FullOpts)
         160 ( 8.40% of base) : 2974.dasm - SixLabors.ImageSharp.Processing.Processors.Convolution.KernelSamplingMap:BuildSamplingOffsetMap(int,int,SixLabors.ImageSharp.Rectangle):this (FullOpts)
          20 ( 8.06% of base) : 996.dasm - BepuPhysics.Collidables.TriangleWide:ComputeNondegenerateTriangleMask(byref,byref,byref,byref,byref) (FullOpts)

Top method improvements (percentages):
         -84 (-3.54% of base) : 767.dasm - DemoBenchmarks.CollisionBatcherTaskBenchmarks:Setup():this (FullOpts)
         -84 (-3.54% of base) : 1170.dasm - DemoBenchmarks.Sweeper:.ctor():this (FullOpts)
         -44 (-1.90% of base) : 1683.dasm - DemoBenchmarks.ConvexCollisionTesterBenchmarks:Setup():this (FullOpts)
         -16 (-1.40% of base) : 1242.dasm - BepuPhysics.Collidables.Compound:FindLocalOverlaps[BepuPhysics.CollisionDetection.CollisionTasks.ChildOverlapsCollection](byref,byref,byref,float,BepuUtilities.Memory.BufferPool,BepuPhysics.Collidables.Shapes,ulong):this (FullOpts)
          -8 (-1.05% of base) : 1624.dasm - BepuPhysics.Collidables.Compound:RayTest[DemoBenchmarks.ShapeRayBenchmarksDeep+HitHandler](byref,byref,byref,BepuPhysics.Collidables.Shapes,byref):this (FullOpts)
          -8 (-0.73% of base) : 1254.dasm - DemoBenchmarks.ShapeRayBenchmarksDeep:Setup():this (FullOpts)
         -12 (-0.64% of base) : 1630.dasm - DemoBenchmarks.RagdollTubeBenchmark:Initialize():this (FullOpts)
          -8 (-0.53% of base) : 1183.dasm - BepuPhysics.BoundingBoxHelpers:GetLocalBoundingBoxForSweep(BepuPhysics.Collidables.TypedIndex,BepuPhysics.Collidables.Shapes,byref,byref,byref,byref,byref,byref,float,byref,byref,byref) (FullOpts)
         -36 (-0.35% of base) : 1645.dasm - DemoBenchmarks.RagdollTubeBenchmark:AddLeg(System.Numerics.Vector3,BepuPhysics.RigidPose,BepuPhysics.BodyHandle,byref,int,int,BepuPhysics.RigidPose,BepuPhysics.CollidableProperty`1[DemoBenchmarks.RagdollTubeBenchmark+SubgroupCollisionFilter],BepuPhysics.Constraints.SpringSettings,BepuPhysics.Simulation):DemoBenchmarks.RagdollTubeBenchmark+RagdollLegHandles (FullOpts)
         -24 (-0.24% of base) : 1643.dasm - DemoBenchmarks.RagdollTubeBenchmark:AddArm(float,System.Numerics.Vector3,BepuPhysics.RigidPose,BepuPhysics.BodyHandle,byref,int,int,BepuPhysics.RigidPose,BepuPhysics.CollidableProperty`1[DemoBenchmarks.RagdollTubeBenchmark+SubgroupCollisionFilter],BepuPhysics.Constraints.SpringSettings,BepuPhysics.Simulation):DemoBenchmarks.RagdollTubeBenchmark+RagdollArmHandles (FullOpts)


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment