Created
          April 11, 2017 20:47 
        
      - 
      
- 
        Save pcordes/304c70cf8b83e5e4e3825b0cae7bf58e to your computer and use it in GitHub Desktop. 
    copy + bswap in-place vs. copy + bswap on the fly, to simulate read() vs. mmap
  
        
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | ;; 3 versions of the benchmark; enable exactly one with the NASM/YASM preprocessor (%if 1 / %if 0, like C #if 1 / #if 0): | |
| ;; 1. copy + shuffle on the fly | |
| ;; 2. copy + shuffle in-place | |
| ;; 3. read-only up/down/bidir | |
| ;;; ~/bin/asm-link loop-up-down.asm && disas loop-up-down && ocperf.py stat -etask-clock,page-faults,cycles,L1-dcache-loads,L1-dcache-load-misses,LLC-loads,LLC-load-misses,instructions,dtlb_store_misses.miss_causes_a_walk -r4 ./loop-up-down | |
| ;;;~/bin/asm-link loop-up-down.asm && rm loop-up-down.o && disas loop-up-down && nice ocperf.py stat -etask-clock,page-faults,cycles,L1-dcache-loads,LLC-loads,LLC-load-misses,instructions,dtlb_store_misses.miss_causes_a_walk,dtlb_load_misses.stlb_hit,dtlb_load_misses.miss_causes_a_walk -r3 ./loop-up-down | |
| default rel | |
| section .data | |
| align 1024*1024*2 | |
| times 4096+SRC_MISALIGN db 1 ; padding after the 2 MiB boundary: srcbuf starts SRC_MISALIGN bytes away from a 4 KiB multiple (e.g. SRC_MISALIGN = -151 gives 4096-151 bytes of padding); SRC_MISALIGN and BUFKB are set by whichever variant below is enabled | |
| srcbuf: times 1024*BUFKB db 0 ; source buffer, BUFKB KiB of explicit zeros; not BSS, so it's private memory-mapped, not to the zero-page | |
| section .bss | |
| align 1024*1024*8 | |
| dstbuf: resb 1024*BUFKB ; destination buffer, BUFKB KiB; initially all mapped to the same zero page, which fits in L1. Unless it's a hugepage | |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Copy-and-shuffle on the fly ;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
| ;; Variant 1: each pass over the buffers loads UNROLL ymm vectors from srcbuf, runs vpshufb on them and stores them to dstbuf; rbx counts the passes (sized for about 200M vectors in total), then the program exits through syscall 231 (exit_group on x86-64 Linux) with edi = 0. (like mmap but the page-fault penalty only happens on the first iteration): | |
| %if 1 | |
| section .text | |
| global _start | |
| _start: | |
| vpxor ymm15, ymm15 | |
| ; ymm15 = 0 is the vpshufb control vector used below, a placeholder for a real byte-swap mask. Results are in 32-byte vectors per second. On-the-fly copy+shuffle without an infloop running: | |
| ; buf=128M : up: 330M/s @2.9GHz. src is misaligned, dst is aligned | |
| ; buf=512M unrolled by 8 : 277M/s @ 2.45GHz. aligned. With NT stores: 302M/s @2.3GHz | |
| ; on-the-fly copy with infloop running (a busy loop on another core, to keep the clock speed up): somewhat faster than memcpy + process in-place | |
| ; buf=32M : up: 385M/s @3.75GHz. src and dst are aligned | |
| ; buf=32M : up: 385M/s @3.75GHz. src is 13B misaligned, dst is aligned = 12320 MByte/s | |
| ; buf=128M: up: 350M/s @3.6GHz. src is 13B misaligned, dst is aligned. Other misalignments, e.g. 4096-40, are the same. Can't find a 4k problem | |
| ; buf=128M: up: 350M/s @3.6GHz. src is 13B misaligned, dst is aligned | |
| ; buf=512M unroll=8 : 300M/s @ 3.14GHz. aligned. NT stores: 340M/s @3.0GHz. No change for unroll by 4 | |
| ; buf=1M aligned, unroll=4 up: 957M+-2 | |
| ; buf=3M aligned, unroll=4 up: 883-906M/s. unroll=8: 880-913M/s. | |
| ; buf=3M aligned unroll=4 up: 920-940M/s (a few minutes later). interleaved (load+shuffle+store, repeat): 903-940M/s | |
| ;;; Can't reproduce: buf=3M up: misaligned by -151: 940M/s. aligned: 975M/s. | |
| ; buf=3M aligned unroll=4 NT 1013-1028M/s. NT interleaved (load1+shuffle1+store1): 1006-1024M/s. unroll=1,2,3, or 8: same | |
| ; buf=3M NT stores, misalign -71 (worst for u=4): u=4: 987-1000M. u=3: 973-983. u=2: 985 u=1: 969-980, one outlier at 1000 | |
| ;;NT interleaved : 1006-1024M/s. unroll=1,2,3, or 8: same | |
| ; buf=256k up: src misaligned by -171: 970M/s. aligned 1005M/s (dst on a 4k boundary, src 171B before a boundary) | |
| ; buf=32k up: src misaligned by -171: 1180M/s. aligned 1605M/s. by -371: 1430M/s: 4k aliasing is much less of a problem | |
| ; buf=16k up: src misaligned by -171: 1298M/s. aligned 3500M/s. by -371: 1850M/s | |
| ;; misalignment from -131 to -251: ~1285 to 1300M/s. -121: 1526M/s -261: 1774M/s. Worst case: -131 to -151: 1285M/s. | |
| BUFKB equ 1024*32 | |
| SRC_MISALIGN equ 0 | |
| %define STRIDE 32 | |
| %define STORE vmovdqu ; store instruction used in the loop; swap in vmovntdq for the non-temporal ("NT") store results listed above | |
| %define UNROLL 4 | |
| mov rbx, 200000000 / (1024*BUFKB/32) | |
| .repeatloop: | |
| mov esi, srcbuf | |
| mov edi, dstbuf | |
| ALIGN 16 | |
| .bufloop: | |
| %assign i 0 | |
| %rep UNROLL | |
| vmovdqu ymm %+ i, [rsi+STRIDE*i] | |
| ; vpshufb ymm %+ i, ymm15 ; disabled: this line and the STORE line below belong to the interleaved variant (load, shuffle, store per vector) | |
| ; STORE [rdi+STRIDE*i], ymm %+ i | |
| %assign i i+1 | |
| %endrep | |
| %assign i 0 | |
| %rep UNROLL | |
| vpshufb ymm %+ i, ymm15 | |
| %assign i i+1 | |
| %endrep | |
| %assign i 0 | |
| %rep UNROLL | |
| STORE [rdi+STRIDE*i], ymm %+ i | |
| %assign i i+1 | |
| %endrep | |
| ;; vmovdqu ymm0, [rsi+STRIDE*0] ; disabled: hand-unrolled equivalent of the three %rep groups above (continues through the STORE lines below) | |
| ;; vmovdqu ymm1, [rsi+STRIDE*1] | |
| ;; vmovdqu ymm2, [rsi+STRIDE*2] | |
| ;; vmovdqu ymm3, [rsi+STRIDE*3] | |
| ;; ;; vmovdqu ymm4, [rsi+STRIDE*4] | |
| ;; ;; vmovdqu ymm5, [rsi+STRIDE*5] | |
| ;; ;; vmovdqu ymm6, [rsi+STRIDE*6] | |
| ;; ;; vmovdqu ymm7, [rsi+STRIDE*7] | |
| ;; vpshufb ymm0, ymm15 | |
| ;; vpshufb ymm1, ymm15 | |
| ;; vpshufb ymm2, ymm15 | |
| ;; vpshufb ymm3, ymm15 | |
| ;; ;; vpshufb ymm4, ymm15 | |
| ;; ;; vpshufb ymm5, ymm15 | |
| ;; ;; vpshufb ymm6, ymm15 | |
| ;; ;; vpshufb ymm7, ymm15 | |
| ;; STORE [rdi+STRIDE*0], ymm0 | |
| ;; STORE [rdi+STRIDE*1], ymm1 | |
| ;; STORE [rdi+STRIDE*2], ymm2 | |
| ;; STORE [rdi+STRIDE*3], ymm3 | |
| ;; ;; STORE [rdi+STRIDE*4], ymm4 | |
| ;; ;; STORE [rdi+STRIDE*5], ymm5 | |
| ;; ;; STORE [rdi+STRIDE*6], ymm6 | |
| ;; ;; STORE [rdi+STRIDE*7], ymm7 | |
| add esi, STRIDE*UNROLL ; advance the source pointer by one unrolled chunk (128 bytes with STRIDE=32, UNROLL=4) | |
| add edi, STRIDE*UNROLL ; advance the destination pointer likewise; the cmp/jbe below repeats while edi <= dstbuf + 1024*BUFKB - STRIDE*UNROLL, i.e. while another full chunk still fits in dstbuf | |
| cmp edi, dstbuf + 1024*BUFKB - STRIDE*UNROLL | |
| jbe .bufloop | |
| dec rbx | |
| jg .repeatloop | |
| xor edi,edi | |
| mov eax,231 | |
| syscall | |
| %endif | |
| ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; read() and process in-place: rep movsb copies one chunk of up to BLOCK KiB from srcbuf to dstbuf (standing in for read()), then that chunk is shuffled in place inside dstbuf; rbx counts whole-buffer passes ;;;;;;;;;;;;;;;;;;;;;;;; | |
| %if 0 | |
| section .text | |
| global _start | |
| _start: | |
| vpxor ymm15, ymm15 | |
| ;; ymm15 = 0 is the vpshufb control vector used below. Results, SKL i7-6700k @ 3.9GHz(max) with DDR4-2666: | |
| ;24k just in-place bswap: up or down: ~3850M/s L1d loads, i.e. vectors loaded+shuffled+stored / sec | |
| ;128k just in-place bswap: down: ~2330M up: ~2330M | |
| ;128+64k down: 2320M/s up: 2300M/s | |
| ;256k: down: 2325 M/s up: 2300 M/s | |
| ;1M : down: 1590 M/s up: 1607 M/s | |
| ;4M : down: 1550 up: 1576 | |
| ;8M : same 1550 | |
| ;8M+32k : 1500 | |
| ;8M+64k : up 1450 | |
| ;8M+128k : up 1370 | |
| ;8M+256k : up 1250 | |
| ;10M : down: 735M up: 870M | |
| ;; Actually doing the rep movs: | |
| ;buf=8MB block=128kB ; down: ~340Mload/s | |
| ;; with no infloop | |
| ;buf=16M block=4M : down: 251M/s up: 250M vectors copied+swapped / sec (from L1d loads / sec, which doesn't count rep movsb), @2.67GHz | |
| ;buf=16M block=128k : down: 264M/s up: 261M/s (both @ ~2.67GHz) | |
| ; without cache-blocking | |
| ;buf=10M block=10M : down: 242M (@2.67GHz) up: 240M (@2.67GHz). no infloop | |
| ;buf=10M block=10M : down: 290M (@3.8GHz) up: 290M (@3.8GHz). infloop running on another core to keep speed up | |
| ;buf=32M block=32M : down: 244M (@3.8GHz) up: 240M (@3.8GHz) | |
| ;; with an infloop running to keep clock speed up. | |
| ;; src+dst hot in L3: | |
| ;buf=3M block=256k ; down: 610M/s up: 610M/s @ 3.89GHz. Without infloop: 605M/s to 615M/s | |
| ;buf=3M block=128k ; down: 620M/s up: 598M/s @ 3.82GHz | |
| ;buf=3M block=64k ; down: 637M/s up: 610M/s @ 3.88GHz. movntdq: 280M/s up/down | |
| ;buf=3M block=48k ; down: 655M/s up: 610M/s @ 3.88GHz | |
| ;buf=3M block=32k ; down: 660 to 688M/s up: 624M/s @ 3.88GHz | |
| ;buf=3M block=16k ; down: 777M/s up: 756 to 767M/s @ 3.89GHz. movntdq: 225M/s up/down | |
| ;; Larger than L3: | |
| ;buf=32M block=1M : down: 304M/s up: 305M @3.75GHz | |
| ;buf=32M block=512k : down: 311M/s up: 305M @3.75GHz | |
| ;buf=32M block=256k : down: 312M/s up: 311M @3.75GHz | |
| ;buf=32M block=128k : down: 318M/s up: 315M @3.75GHz not sensitive to src misalignment | |
| ;buf=32M block=64k : down: 325M up: 315M/s vectors/s @ 3.75GHz. up NT: 235M. down NT: 227M | |
| ;buf=32M block=32k : down: 330M up: 320M/s @3.75GHz | |
| ;buf=32M block=24k : down: 337M up: 331M/s @3.75GHz | |
| ;buf=32M block=16k : down: 341M up: 351M/s @3.75GHz ; mostly this is from overlapping rep movsb with the loop, not L1 hits | |
| ;;buf=32M block=8k : down: 325M up: 346M/s @3.75GHz | |
| ;;buf=32M block=4k : down: 300M up: 335M/s @3.75GHz | |
| %define STRIDE -32 | |
| %define STORE vmovdqu | |
| ;%define STORE vmovntdq | |
| SRC_MISALIGN equ 0 | |
| BLOCK equ 512 | |
| BUFKB equ 1024*32 | |
| mov rbx, 200000000 / (1024*BUFKB/128) / 4 | |
| .repeatloop: | |
| mov esi, srcbuf | |
| mov edi, dstbuf | |
| .bufloop: | |
| ; read() a block: the rep movsb below copies ecx bytes from [rsi] (srcbuf) to [rdi] (dstbuf), advancing both pointers | |
| mov ecx, 1024*BLOCK | |
| mov eax, srcbuf+1024*BUFKB | |
| sub eax, esi | |
| cmp ecx, eax | |
| cmovg ecx, eax ; ecx = min(1024*BLOCK, bytes left in srcbuf), using a signed compare | |
| %if STRIDE > 0 ; always-ascending memcpy from "pagecache"; the sign of STRIDE only selects whether the in-place pass walks the copied block upward (this branch) or downward (the %else branch) | |
| mov eax, edi ; eax = start of the block in dstbuf, taken before the copy advances rdi | |
| rep movsb | |
| mov edx, edi ; edx = end of the block (rdi after the copy), the loop sentinel | |
| ;; add edi, ecx ;; skip the copy | |
| ;; mov eax, dstbuf | |
| ;; mov edx, dstbuf+1024*BLOCK | |
| %else | |
| lea edx, [rdi+STRIDE] ; edx = block start + STRIDE (one vector below the block), the sentinel for the descending loop | |
| rep movsb | |
| lea eax, [rdi+STRIDE] ; eax = block end + STRIDE = address of the last vector in the block, where the descending loop starts | |
| ;; add edi, ecx | |
| ;; mov eax, dstbuf+1024*BLOCK + STRIDE | |
| ;; mov edx, dstbuf + STRIDE | |
| %endif | |
| ; and bswap it in place: 4 vectors per iteration are loaded from dstbuf with vmovdqa (requires 32-byte alignment), shuffled, and stored back to the same addresses. The exit test is an exact jne match against edx, so the copied size must be a multiple of abs(STRIDE)*4 = 128 bytes | |
| ALIGN 16 | |
| .blockloop: | |
| vmovdqa ymm0, [rax+STRIDE*0] | |
| vmovdqa ymm1, [rax+STRIDE*1] | |
| vmovdqa ymm2, [rax+STRIDE*2] | |
| vmovdqa ymm3, [rax+STRIDE*3] | |
| vpshufb ymm0, ymm15 | |
| vpshufb ymm1, ymm15 | |
| vpshufb ymm2, ymm15 | |
| vpshufb ymm3, ymm15 | |
| STORE [rax+STRIDE*0], ymm0 | |
| STORE [rax+STRIDE*1], ymm1 | |
| STORE [rax+STRIDE*2], ymm2 | |
| STORE [rax+STRIDE*3], ymm3 | |
| add eax, STRIDE*4 ; step by 4 vectors: -128 bytes with STRIDE = -32, +128 with STRIDE = 32 | |
| cmp eax, edx | |
| jne .blockloop | |
| cmp edi, dstbuf + 1024*BUFKB | |
| jb .bufloop | |
| dec rbx | |
| jg .repeatloop | |
| xor edi,edi | |
| mov eax,231 | |
| syscall | |
| %endif | |
| ;;;;;;;;;;;;;;;;;;;;;;;;;; Read-only loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; | |
| %if 0 | |
| ;;; Variant 3: read-only benchmark. After zero-filling buf once, every pass sweeps the buffer | |
| ;;; with INSN ymmN, [mem] (a 32-byte memory source operand), 4 vectors per loop iteration. | |
| ;;; Modes, chosen with the two %defines below: | |
| ;;;   DOWNONLY=0, DO_DOWNLOOP=0 : ascending sweep only | |
| ;;;   DOWNONLY=0, DO_DOWNLOOP=1 : ascending sweep followed by a descending sweep (bidir) | |
| ;;;   DOWNONLY=1                : descending sweep only (DO_DOWNLOOP is forced to 1) | |
| ;;; The repeat count in rcx is scaled so that about 10^9 INSNs execute in every mode. | |
| ;;; The program exits through syscall 231 (exit_group on x86-64 Linux) with status 0. | |
| ;;; Reading memory in ascending order is faster on Skylake, for read-only loops | |
| ;;; But alternating up/down loops are a big win when the buffer size is >= cache size. (For L1, L2, and L3). | |
| ;;; Hugepages probably make TLB effects insignificant. | |
| section .bss | |
| align 1024*1024*8 | |
| buf: resb 1024*KB ; initially all mapped to the same zero page, which fits in L1. Unless it's a hugepage | |
| section .text | |
| default rel | |
| global _start | |
| _start: | |
| mov edi, buf | |
| xor eax, eax ; rax = 0, the qword value stored by rep stosq | |
| mov ecx, 1024 * KB / 8 | |
| rep stosq ; dirty the buffer | |
| %define DOWNONLY 0 | |
| %define STRIDE 32 | |
| ;; for large buffers (memory-bound), SKL downclocks if there aren't threads on other cores. Without that, ironically vsqrtps runs faster than vmovaps because it keeps the core clock high | |
| ;;; For very large buffers, down is slower than up. Even though this does trivial work, so demand-loads outpace HW prefetching. | |
| ;KB equ 1024*10 ; down: 895+-1% up: 1010+-2% bidir: 1210+-1% (vxorps with infloop active) | |
| ;KB equ 1024*8 - 1 ; up: ~1300M/s+-2% down: ~1350+-10%. bidir: 1525+-3% | |
| ;;; L3 hits | |
| ;KB equ 1024 * 7 ; downonly: ~1920Mloads/s +-50. uponly: 1980M/s +-50. bidir: 2000M/s +-40?. With vxorps or vaddps, with a busy-loop on another core to keep the clock at 3.9GHz, which it should do anyway with an L3 working set | |
| ; at 2MiB: hard to measure any difference | |
| ;KB equ 1024 * 1 ; downonly and uponly: ~2000Mload/s or maybe 2100, bidir: ~2300M/s (vxorps) | |
| ;KB equ 1024 / 2 ; down or up: 2140M/s bidir: 2650 (vxorps) | |
| ;KB equ 256+128 ; down ~2300 up: 2340 bidir: 2940 | |
| ;;; L2 hits | |
| ;;;;KB equ 1024 / 4 = L2; up: 2670 to 2930. down: 2620 to 2860, but usually only ~2700. bidir: 3340 to 3450. L1 miss rate ~ 44% (down from 50%) | |
| ;;KB equ 128 ; up: 3250 to 3600. down: ~3250. bidir: 3960 to 4215 (better than one per clock with vxorps). (vaddps=~3620M/s, nearly hitting its latency bottleneck) | |
| ;;KB equ 48 ; up: 3650, still 50% L1 miss rate. down: 3450-3650. bidir: ~5600Mloads/sec. L1 miss rate ~= 17% | |
| KB equ 256+128 | |
| %define INSN vxorps | |
| %define DO_DOWNLOOP 0 | |
| ;; DOWNONLY=1 skips the ascending loop, so the descending loop has to be assembled. | |
| ;; Without this, DOWNONLY=1 with DO_DOWNLOOP=0 would leave an empty pass and make the | |
| ;; divisor 4*(1+DO_DOWNLOOP-DOWNONLY) in the repeat count below evaluate to 0. | |
| %if DOWNONLY | |
| %define DO_DOWNLOOP 1 | |
| %endif | |
| ; mov rcx, 5000000000 / (1024*KB/128) / 8 | |
| ;; rcx = number of passes: the INSN budget divided by the loop iterations per sweep, | |
| ;; the 4 INSNs per iteration, and the number of sweeps per pass (1, or 2 for bidir). | |
| mov rcx, 1000000000 / (1024*KB/(STRIDE*4)) / (4*(1+DO_DOWNLOOP-DOWNONLY)) | |
| ALIGN 32 | |
| .repeatloop: | |
| mov esi, buf | |
| %if DOWNONLY != 1 | |
| ;; ascending sweep: esi runs from buf to buf + 1024*KB in steps of STRIDE*4 | |
| .workloop: | |
| INSN ymm0, [rsi+STRIDE*0] | |
| INSN ymm1, [rsi+STRIDE*1] | |
| INSN ymm2, [rsi+STRIDE*2] | |
| INSN ymm3, [rsi+STRIDE*3] | |
| add esi, STRIDE*4 ;128 | |
| cmp esi, buf-STRIDE*3 + 1024*KB | |
| jb .workloop | |
| %else | |
| ;; down-only: start the descending sweep from the end of the buffer | |
| mov esi, buf + 1024*KB | |
| %endif | |
| %if DO_DOWNLOOP | |
| ;; descending sweep: esi enters pointing at the end of the buffer and steps down to buf | |
| .downloop: | |
| sub esi, STRIDE*4 ;128 | |
| INSN ymm3, [rsi+STRIDE*3] | |
| INSN ymm2, [rsi+STRIDE*2] | |
| INSN ymm1, [rsi+STRIDE*1] | |
| INSN ymm0, [rsi+STRIDE*0] | |
| cmp esi, buf | |
| ja .downloop | |
| %endif | |
| dec rcx | |
| jnz .repeatloop | |
| xor edi,edi | |
| mov eax,231 | |
| syscall | |
| %endif | |
  
    Sign up for free
    to join this conversation on GitHub.
    Already have an account?
    Sign in to comment