        page    ,132
        title   memcpy - Copy source memory bytes to destination
;***
;memcpy.asm - contains memcpy and memmove routines
;
;       Copyright (c) Microsoft Corporation. All rights reserved.
;
;Purpose:
;       memcpy() copies a source memory buffer to a destination buffer.
;       Overlapping buffers are not treated specially, so propagation may occur.
;       memmove() copies a source memory buffer to a destination buffer.
;       Overlapping buffers are treated specially, to avoid propagation.
;
;*******************************************************************************

include ksamd64.inc

        subttl  "memcpy"
;***
;memcpy - Copy source buffer to destination buffer
;
;Purpose:
;       memcpy() copies a source memory buffer to a destination memory buffer.
;       This routine does NOT recognize overlapping buffers, and thus can lead
;       to propagation.
;       For cases where propagation must be avoided, memmove() must be used.
;
;       Algorithm:
;
;       void * memcpy(void * dst, void * src, size_t count)
;       {
;               void * ret = dst;
;
;               /*
;                * copy from lower addresses to higher addresses
;                */
;               while (count--)
;                       *dst++ = *src++;
;
;               return(ret);
;       }
;
;memmove - Copy source buffer to destination buffer
;
;Purpose:
;       memmove() copies a source memory buffer to a destination memory buffer.
;       This routine recognizes overlapping buffers to avoid propagation.
;       For cases where propagation is not a problem, memcpy() can be used.
;
;       Algorithm:
;
;       void * memmove(void * dst, void * src, size_t count)
;       {
;               void * ret = dst;
;
;               if (dst <= src || dst >= (src + count)) {
;                       /*
;                        * Non-Overlapping Buffers
;                        * copy from lower addresses to higher addresses
;                        */
;                       while (count--)
;                               *dst++ = *src++;
;               }
;               else {
;                       /*
;                        * Overlapping Buffers
;                        * copy from higher addresses to lower addresses
;                        */
;                       dst += count - 1;
;                       src += count - 1;
;
;                       while (count--)
;                               *dst-- = *src--;
;               }
;
;               return(ret);
;       }
;
;
;Entry:
;       void *dst = pointer to destination buffer
;       const void *src = pointer to source buffer
;       size_t count = number of bytes to copy
;
;Exit:
;       Returns a pointer to the destination buffer in RAX
;
;Uses:
;       RAX, RCX, RDX, R8-R11, XMM0, XMM1
;
;Exceptions:
;*******************************************************************************
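;
; To make the contract above concrete, here is a hedged C usage sketch (not part
; of the original routine): it shows why overlapping moves must go through
; memmove(), while memcpy() is only guaranteed for disjoint buffers. The buffer
; contents are illustrative.
;
;       #include <stdio.h>
;       #include <string.h>
;
;       int main(void)
;       {
;               char buf[] = "abcdefgh";
;
;               /* overlapping regions: memmove() behaves as if it copied through
;                  a temporary, so no source byte is clobbered before it is read */
;               memmove(buf + 2, buf, 6);       /* buf becomes "ababcdef" */
;               puts(buf);
;
;               char dst[8];
;               memcpy(dst, "1234567", 8);      /* disjoint buffers: memcpy() is fine */
;               puts(dst);
;               return 0;
;       }
;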
        extrn   __favor:dword
        extrn   __ImageBase:byte
        extrn   __memcpy_nt_iters:qword         ; defined in cpu_disp.c

__FAVOR_ENFSTRG equ 1
__FAVOR_SMSTRG  equ 2

; Code for copying a block using enhanced fast strings.
; This code needs to be in a separate routine because it uses non-volatile
; registers which must be saved and restored for exception handling.
; (A C-level sketch of this path follows the routine.)

NESTED_ENTRY memcpy_repmovs, _TEXT
        push_reg rdi
        push_reg rsi
        .endprolog
        mov     rax, r11                        ; return original destination pointer
        mov     rdi, rcx                        ; move destination pointer to rdi
        mov     rcx, r8                         ; move length to rcx
        mov     rsi, r10                        ; move source pointer to rsi
        rep     movsb                           ; copy source to destination buffer
        .beginepilog
        pop     rsi
        pop     rdi
        ret                                     ; return
NESTED_END memcpy_repmovs, _TEXT
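;
; Conceptually, memcpy_repmovs is the "rep movsb" fast path that the dispatcher
; below selects when enhanced fast strings (ERMSB) are reported by the CPU
; detection code. A hedged C sketch using the MSVC __movsb intrinsic (the function
; name is illustrative, not part of this file):
;
;       #include <intrin.h>
;       #include <stddef.h>
;
;       /* sketch: copy with rep movsb and return the original destination,
;          which is what the assembly wrapper adds around the string move */
;       static void *repmovs_copy(void *dst, const void *src, size_t count)
;       {
;               __movsb((unsigned char *)dst, (const unsigned char *)src, count);
;               return dst;
;       }
;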
; Main memmove/memcpy routine

        public memmove

        LEAF_ENTRY_ARG3 memcpy, _TEXT, dst:ptr byte, src:ptr byte, count:dword

        OPTION PROLOGUE:NONE, EPILOGUE:NONE

memmove = memcpy

        mov     r11, rcx                        ; save destination address
        mov     r10, rdx                        ; save source address
        cmp     r8, 16                          ; if 16 bytes or less
        jbe     MoveBytes16                     ; go move them quick
        cmp     r8, 32                          ; check for length <= 32 (we know it's > 16)
        jbe     Move17to32                      ; go handle lengths 17-32 as a special case
        sub     rdx, rcx                        ; compute offset to source buffer
        jae     CopyUp                          ; if above or equal, go move up
        lea     rax, [r8+r10]                   ; else check that src+count < dst
        cmp     rcx, rax                        ; (src + count) < dst?
        jb      CopyDown                        ; no, buffers overlap, go move downward
CopyUp:
        cmp     r8, 128
        jbe     XmmCopySmall
        bt      __favor, __FAVOR_ENFSTRG        ; check for ENFSTRG (enhanced fast strings)
        jnc     XmmCopyUp                       ; if enhanced fast strings are not available, use XMM
        jmp     memcpy_repmovs
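;
; The dispatch above, restated as a hedged C sketch. The thresholds 16, 32 and 128
; are the ones used in this file; the enum and function names are illustrative.
; (XmmCopySmall can itself still fall back to rep movsb via __FAVOR_SMSTRG, which
; the sketch omits.)
;
;       #include <stddef.h>
;
;       enum copy_path { SMALL_TABLE, MOVE_17_32, COPY_DOWN, XMM_SMALL, REP_MOVSB, XMM_UP };
;
;       /* ermsb: nonzero when enhanced fast strings are reported */
;       static enum copy_path pick_path(const char *dst, const char *src,
;                                       size_t count, int ermsb)
;       {
;               if (count <= 16)                    return SMALL_TABLE;  /* MoveBytes16 jump table */
;               if (count <= 32)                    return MOVE_17_32;   /* Move17to32             */
;               if (src < dst && dst < src + count) return COPY_DOWN;    /* overlap, copy backward */
;               if (count <= 128)                   return XMM_SMALL;    /* XmmCopySmall           */
;               return ermsb ? REP_MOVSB : XMM_UP;                       /* CopyUp                 */
;       }
;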
; Handle lengths 17-32 as a special case using XMM registers.
; This allows the regular code to assume that there will always be enough
; bytes for the "deferred" block of 16. Also, any case that can be handled
; with just two stores is handled with just two stores; the regular code
; will always do 3 stores for unaligned moves that have a remainder.
; No assumptions are made here about buffer alignment or overlap.
; We load the entire string to be moved in 2 xmm registers before storing
; anything, so this works for any arrangement of overlapping buffers.
;
; dst is in rcx (can modify) and r11 (must preserve for return value)
; src is in r10 (should preserve for consistency)
; rdx still holds the source address on this path (the src-dst offset is only
; computed on the CopyUp/CopyDown paths)
; r8 is the length, and is known to be 17 <= r8 <= 32
;
; When length < 32 the first 16 bytes include some of the last 16 bytes
; and we will store (32 - length) bytes twice. (E.g. in the worst case
; of len 17 we are storing the middle 15 bytes of the buffer twice.)
; This is still much faster than doing logic and branching with 1, 2, 4
; and 8 byte conditional copies.
;
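; A hedged C sketch of the 17-32 byte case using SSE intrinsics (the function name
; is illustrative; 17 <= count <= 32 is assumed). Both halves are loaded before
; either store, so it is safe even when the buffers overlap, and up to
; (32 - count) bytes in the middle are simply written twice:
;
;       #include <emmintrin.h>
;       #include <stddef.h>
;
;       static void *copy_17_to_32(void *dst, const void *src, size_t count)
;       {
;               const char *s = (const char *)src;
;               char *d = (char *)dst;
;               __m128i head = _mm_loadu_si128((const __m128i *)s);
;               __m128i tail = _mm_loadu_si128((const __m128i *)(s + count - 16));
;               _mm_storeu_si128((__m128i *)d, head);
;               _mm_storeu_si128((__m128i *)(d + count - 16), tail);
;               return dst;
;       }
;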
        align   16

Move17to32:
        movups  xmm0, [rdx]                     ; load first 16 bytes of src
        movups  xmm1, (-16)[rdx + r8]           ; load last 16 bytes of src
        movups  [rcx], xmm0                     ; store first 16 bytes of dst
        movups  (-16)[rcx + r8], xmm1           ; store last 16 bytes of dst
        mov     rax, rcx                        ; set destination address
        ret
;
; Move residual bytes.
;
        align   16

MoveBytes16:
        mov     rax, rcx                        ; mov destination address to rax
        lea     r9, OFFSET __ImageBase
IFDEF _VCRUNTIME_BUILD_QSPECTRE
        and     r8, 1Fh                         ; bound r8 to 0-31 in speculation scenarios (17-31 is padding)
ENDIF
        mov     ecx, [(IMAGEREL MoveSmall) + r9 + r8*4]
        add     rcx, r9
        jmp     rcx

MoveSmall dd IMAGEREL MoveSmall0
        dd      IMAGEREL MoveSmall1
        dd      IMAGEREL MoveSmall2
        dd      IMAGEREL MoveSmall3
        dd      IMAGEREL MoveSmall4
        dd      IMAGEREL MoveSmall5
        dd      IMAGEREL MoveSmall6
        dd      IMAGEREL MoveSmall7
        dd      IMAGEREL MoveSmall8
        dd      IMAGEREL MoveSmall9
        dd      IMAGEREL MoveSmall10
        dd      IMAGEREL MoveSmall11
        dd      IMAGEREL MoveSmall12
        dd      IMAGEREL MoveSmall13
        dd      IMAGEREL MoveSmall14
        dd      IMAGEREL MoveSmall15
        dd      IMAGEREL MoveSmall16
IFDEF _VCRUNTIME_BUILD_QSPECTRE
        dd      15 dup (IMAGEREL MoveSmall0)    ; 17 -> 31 padding
ENDIF
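;
; In C terms, MoveBytes16 is a computed jump through a table of image-relative
; 32-bit entries, one per length 0..16. Under /Qspectre the index is first masked
; to 0..31, and the extra slots point at the length-0 handler, so a mis-speculated
; out-of-range count cannot index past the table. A hedged sketch of that bound
; (the function name is illustrative; the real code dispatches to the 17 fixed-size
; handlers below instead of calling memcpy):
;
;       #include <string.h>
;       #include <stddef.h>
;
;       static void *copy_small(void *dst, const void *src, size_t count)
;       {
;               size_t n = count & 31;          /* mirrors the /Qspectre masking */
;               if (n <= 16)                    /* 17-31 are padding slots: no copy */
;                       memcpy(dst, src, n);
;               return dst;
;       }
;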
        align   16

MoveSmall0::
        ret

MoveSmall2::
        movzx   ecx, word ptr [rdx]             ; get two bytes from source
        mov     [rax], cx                       ; write two bytes to destination
        ret

MoveSmall8::
        mov     rcx, qword ptr [rdx]            ; get eight bytes from source
        mov     [rax], rcx                      ; write eight bytes to destination
        ret

MoveSmall3::
        movzx   ecx, word ptr [rdx]             ; get two bytes from source
        movzx   r8d, byte ptr 2[rdx]            ; get last byte from source
        mov     [rax], cx                       ; write two bytes to destination
        mov     2[rax], r8b                     ; write last byte to destination
        ret

MoveSmall1::
        movzx   ecx, byte ptr [rdx]             ; get byte from source
        mov     [rax], cl                       ; write byte to destination
        ret

MoveSmall16::
        movdqu  xmm0, xmmword ptr [rdx]         ; get sixteen bytes from source
        movdqu  xmmword ptr [rax], xmm0         ; write sixteen bytes to destination
        ret

        align   16

MoveSmall11::
        mov     r8, qword ptr [rdx]             ; get eight bytes from source
        movzx   ecx, word ptr 8[rdx]            ; get two bytes from source
        movzx   r9d, byte ptr 10[rdx]           ; get last byte from source
        mov     [rax], r8                       ; write eight bytes to destination
        mov     8[rax], cx                      ; write two bytes to destination
        mov     10[rax], r9b                    ; write last byte to destination
        ret

MoveSmall4::
        mov     ecx, dword ptr [rdx]            ; get four bytes from source
        mov     [rax], ecx                      ; write four bytes to destination
        ret

        align   16

MoveSmall5::
        mov     ecx, dword ptr [rdx]            ; get four bytes from source
        movzx   r8d, byte ptr 4[rdx]            ; get last byte from source
        mov     [rax], ecx                      ; write four bytes to destination
        mov     4[rax], r8b                     ; write last byte to destination
        ret

        align   16

MoveSmall6::
        mov     ecx, dword ptr [rdx]            ; get four bytes from source
        movzx   r8d, word ptr 4[rdx]            ; get two bytes from source
        mov     [rax], ecx                      ; write four bytes to destination
        mov     4[rax], r8w                     ; write two bytes to destination
        ret

        align   16

MoveSmall7::
        mov     ecx, dword ptr [rdx]            ; get four bytes from source
        movzx   r8d, word ptr 4[rdx]            ; get two bytes from source
        movzx   r9d, byte ptr 6[rdx]            ; get last byte from source
        mov     [rax], ecx                      ; write four bytes to destination
        mov     4[rax], r8w                     ; write two bytes to destination
        mov     6[rax], r9b                     ; write last byte to destination
        ret

MoveSmall13::
        mov     r8, qword ptr [rdx]             ; get eight bytes from source
        mov     ecx, dword ptr 8[rdx]           ; get four bytes from source
        movzx   r9d, byte ptr 12[rdx]           ; get last byte from source
        mov     [rax], r8                       ; write eight bytes to destination
        mov     8[rax], ecx                     ; write four bytes to destination
        mov     12[rax], r9b                    ; write last byte to destination
        ret

        align   16

MoveSmall9::
        mov     r8, qword ptr [rdx]             ; get eight bytes from source
        movzx   ecx, byte ptr 8[rdx]            ; get last byte from source
        mov     [rax], r8                       ; write eight bytes to destination
        mov     8[rax], cl                      ; write last byte to destination
        ret

        align   16

MoveSmall10::
        mov     r8, qword ptr [rdx]             ; get eight bytes from source
        movzx   ecx, word ptr 8[rdx]            ; get two bytes from source
        mov     [rax], r8                       ; write eight bytes to destination
        mov     8[rax], cx                      ; write two bytes to destination
        ret

        align   16

MoveSmall12::
        mov     r8, qword ptr [rdx]             ; get eight bytes from source
        mov     ecx, dword ptr 8[rdx]           ; get four bytes from source
        mov     [rax], r8                       ; write eight bytes to destination
        mov     8[rax], ecx                     ; write four bytes to destination
        ret

        align   16

MoveSmall14::
        mov     r8, qword ptr [rdx]             ; get eight bytes from source
        mov     ecx, dword ptr 8[rdx]           ; get four bytes from source
        movzx   r9d, word ptr 12[rdx]           ; get two bytes from source
        mov     [rax], r8                       ; write eight bytes to destination
        mov     8[rax], ecx                     ; write four bytes to destination
        mov     12[rax], r9w                    ; write two bytes to destination
        ret

        align   16

MoveSmall15::
        mov     r8, qword ptr [rdx]             ; get eight bytes from source
        mov     ecx, dword ptr 8[rdx]           ; get four bytes from source
        movzx   r9d, word ptr 12[rdx]           ; get two bytes from source
        movzx   r10d, byte ptr 14[rdx]          ; get last byte from source
        mov     [rax], r8                       ; write eight bytes to destination
        mov     8[rax], ecx                     ; write four bytes to destination
        mov     12[rax], r9w                    ; write two bytes to destination
        mov     14[rax], r10b                   ; write last byte to destination
        ret
;
; Memcpy up using SSE instructions.
;
; Preconditions:
;       destination in rcx (destructible) and r11 (must preserve for return value)
;       source in r10
;       length in r8, must be greater than 16
;       offset from dest to src in rdx
;       source addr > dest addr or else buffers don't overlap
;
; Aligned stores are much faster on AMD hardware, so start by moving however many
; bytes must be moved so the updated dst is 16-byte aligned. We need to copy
; (16 - (dest mod 16)) bytes, but it's faster to just do an unaligned copy of 16
; bytes and then start the aligned loop as usual at ((dest - (dest mod 16)) + 16).
; This results in (dest mod 16) bytes being copied twice. This is a lot faster
; than a bunch of code to copy maybe 1 then maybe 2 then maybe 4 then maybe 8
; bytes to achieve dst alignment.
;
; We know the src address is greater than the dst, but not by how much. In the
; case where the difference is less than 16 we must be careful about the bytes
; that will be stored twice. We must do both loads before either store, or the
; second load of those bytes will get the wrong values. We handle this by
; loading the last 16 bytes that can be stored at an aligned address, but
; deferring the store of those bytes to the remainder code, so it can load the
; remainder before storing the deferred bytes. Since either or both of the two
; loops can be skipped, the preconditions needed by the remainder code must
; also apply to the loops. These conditions are:
;       - r8 is the count remaining, not including the deferred bytes
;       - [rcx + rdx] and [rcx] as usual point to the src and dst where the
;         number of bytes given by r8 should be copied from and to
;       - xmm0 holds the 16 deferred bytes that need to be stored at (-16)[rcx]
;
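; A hedged C sketch of the alignment trick (simplified: it assumes non-overlapping
; buffers, so it skips the deferred-store bookkeeping described above; the function
; name is illustrative). The head and tail are copied with unaligned 16-byte moves
; that may rewrite, with identical values, a few bytes the aligned loop also covers:
;
;       #include <emmintrin.h>
;       #include <stddef.h>
;       #include <stdint.h>
;
;       static void *copy_xmm_up(void *dst, const void *src, size_t count)  /* count > 32 */
;       {
;               char *d = (char *)dst;
;               const char *s = (const char *)src;
;
;               /* unaligned head store, then round d up to the next 16-byte boundary */
;               _mm_storeu_si128((__m128i *)d, _mm_loadu_si128((const __m128i *)s));
;               size_t skew = 16 - ((uintptr_t)d & 15);
;               char *da = d + skew;                    /* 16-byte aligned */
;               const char *sa = s + skew;
;               size_t left = count - skew;
;
;               for (; left >= 16; left -= 16, da += 16, sa += 16)
;                       _mm_store_si128((__m128i *)da,  /* aligned store */
;                                       _mm_loadu_si128((const __m128i *)sa));
;
;               /* unaligned tail store covers the last bytes; it overwrites up to
;                  (16 - left) already-copied bytes with the same values */
;               _mm_storeu_si128((__m128i *)(d + count - 16),
;                                _mm_loadu_si128((const __m128i *)(s + count - 16)));
;               return dst;
;       }
;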
        align   16

XmmCopyUp:
        movups  xmm0, [rcx + rdx]               ; load deferred bytes
        add     r8, rcx                         ; r8 points 1 byte past end
        add     rcx, 16                         ; update to next block
        test    r11b, 15                        ; test if destination aligned
        jz      XmmCopyLargeTest                ; go try 128-byte blocks
;
; Move alignment bytes.
;
XmmCopyAlign:
        movaps  xmm1, xmm0                      ; save initial bytes in xmm1
        and     rcx, -16                        ; rcx is 16 bytes past first 16-byte align point
        movups  xmm0, [rcx + rdx]               ; load aligned deferred-store bytes
        add     rcx, 16                         ; update to next block
        movups  [r11], xmm1                     ; now safe to store 16 unaligned at start
;
; See if we can move any 128-byte blocks.
;
XmmCopyLargeTest:
        sub     r8, rcx                         ; r8 restored to count remaining
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 7                           ; compute number of 128-byte blocks
        jz      XmmCopySmallTest                ; if z, jump around to 2nd loop
        movaps  (-16)[rcx], xmm0                ; going into 1st loop, ok to store deferred bytes
        cmp     r9, __memcpy_nt_iters           ; threshold defined in cpu_disp.c
        jna     short XmmCopyLargeInner         ; jump into 1st loop
        jmp     XmmCopyLargeInnerNT             ; long enough that non-temporal is worth it, jump into nt loop
;
; Move 128-byte blocks
;
        align   16
;
; When possible, non-mov instructions are put between a load and store
; so their execution can overlap the store.
; The jnz is likewise moved earlier to come before the last store pair.
; Pairs of loads/stores are used to overlap cache latencies.
; movups and movaps are equally fast on aligned storage; we use movaps
; to document movs that we *know* are going to be aligned, movups otherwise.
; xmm0 must be preloaded before jumping into this loop, and the last
; store must be deferred (and the bytes to store left in xmm0) for the
; following loop and/or the remainder code.
;
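;
; The shape of the 128-byte loop, as a hedged intrinsics sketch (simplified: no
; deferred final store, non-overlapping buffers assumed; 'da' must be 16-byte
; aligned and 'blocks' is the count of full 128-byte blocks):
;
;       #include <emmintrin.h>
;       #include <stddef.h>
;
;       static void copy_128_blocks(char *da, const char *sa, size_t blocks)
;       {
;               for (; blocks; blocks--, da += 128, sa += 128) {
;                       /* paired unaligned loads followed by paired aligned stores,
;                          eight 16-byte chunks per iteration */
;                       for (int i = 0; i < 128; i += 32) {
;                               __m128i a = _mm_loadu_si128((const __m128i *)(sa + i));
;                               __m128i b = _mm_loadu_si128((const __m128i *)(sa + i + 16));
;                               _mm_store_si128((__m128i *)(da + i), a);
;                               _mm_store_si128((__m128i *)(da + i + 16), b);
;                       }
;               }
;       }
;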
XmmCopyLargeOuter:
        movaps  (-32)[rcx], xmm0                ; store 7th chunk from prior iteration
        movaps  (-16)[rcx], xmm1                ; store 8th chunk from prior iteration
XmmCopyLargeInner:                              ; enter loop here with xmm0 preloaded
        movups  xmm0, [rcx + rdx]               ; load first 16 byte chunk
        movups  xmm1, 16[rcx + rdx]             ; load 2nd 16 byte chunk
        add     rcx, 128                        ; advance destination address
        movaps  (-128)[rcx], xmm0               ; store first 16 byte chunk
        movaps  (-112)[rcx], xmm1               ; store 2nd 16 byte chunk
        movups  xmm0, (-96)[rcx + rdx]          ; load 3rd chunk
        movups  xmm1, (-80)[rcx + rdx]          ; load 4th chunk
        dec     r9                              ; dec block counter (set cc for jnz)
        movaps  (-96)[rcx], xmm0                ; store 3rd chunk
        movaps  (-80)[rcx], xmm1                ; store 4th chunk
        movups  xmm0, (-64)[rcx + rdx]          ; load 5th chunk
        movups  xmm1, (-48)[rcx + rdx]          ; load 6th chunk
        movaps  (-64)[rcx], xmm0                ; store 5th chunk
        movaps  (-48)[rcx], xmm1                ; store 6th chunk
        movups  xmm0, (-32)[rcx + rdx]          ; load 7th chunk
        movups  xmm1, (-16)[rcx + rdx]          ; load 8th chunk
        jnz     XmmCopyLargeOuter               ; loop if more blocks

XmmCopyFinish:                                  ; non-temporal codepath rejoins here
        movaps  (-32)[rcx], xmm0                ; store 7th chunk from final iteration
        and     r8, 127                         ; compute remaining byte count
        movaps  xmm0, xmm1                      ; 8th chunk becomes deferred bytes
        jmp     XmmCopySmallTest

XmmCopySmall:
        bt      __favor, __FAVOR_SMSTRG         ; check if string copy should be used
        jc      memcpy_repmovs
        movups  xmm0, [rcx + rdx]               ; load deferred bytes
        add     rcx, 16
        sub     r8, 16
;
; See if we have any 16-byte blocks left to move
;
XmmCopySmallTest:
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 4                           ; compute number of 16-byte blocks
        jz      short XmmCopyTrail              ; on z, no 16-byte blocks, skip 2nd loop

        align   16

XmmCopySmallLoop:
        movups  (-16)[rcx], xmm0                ; the first time through this is the
                                                ; store of the deferred bytes from above
        movups  xmm0, [rcx + rdx]               ; load a block
        add     rcx, 16                         ; advance dest addr (store is deferred)
        dec     r9
        jnz     XmmCopySmallLoop

XmmCopyTrail:
        and     r8, 15                          ; compute remaining byte count
        jz      short XmmCopyReturn             ; if z, no remainder bytes to move
;
; Handle remainder bytes.
;
; As at the start, we are going to do an unaligned copy of 16 bytes which will
; double-write some bytes. We must not touch rcx or xmm0 because they have what
; we need to store the deferred block. We use rax to point to the first byte after
; the end of the buffer and back up from there. Note rax is pointing to an address
; we must not read or write!
;
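;
; A hedged restatement of the remainder trick, reusing the d/s/count names from the
; sketches above: the final unaligned 16-byte store re-covers 16 - r8 bytes that
; were already written with identical values (e.g. a remainder of 5 rewrites 11
; bytes that the aligned loop already stored).
;
;       _mm_storeu_si128((__m128i *)(d + count - 16),
;                        _mm_loadu_si128((const __m128i *)(s + count - 16)));
;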
        lea     rax, [rcx+r8]                   ; make rax point one past the end
        movups  xmm1, (-16)[rax + rdx]          ; load last 16 bytes of source buffer
        movups  (-16)[rax], xmm1                ; write last 16 bytes, including 16-r8 bytes
                                                ; from the last aligned block which we are
                                                ; about to overstore with identical values
XmmCopyReturn:
        movups  (-16)[rcx], xmm0                ; store the last deferred aligned block
        mov     rax, r11                        ; we must return the original destination address
        ret
;
; Move 128-byte blocks non-temporal
;
        align   16
;
; The non-temporal loop is exactly the same as the regular xmm loop above, except
; the movaps stores are movntps and we use prefetchnta. We prefetch in two places;
; each prefetch gets 64 bytes about half an iteration ahead of time (about 10
; instructions of lead time). When we come to the end of the memcpy, we'll be
; prefetching bytes beyond the buffer we need to copy from, which may not be valid
; bytes. This is not illegal: if the memory address is invalid it does not trap,
; since the hardware treats illegal prefetches as nops.
;
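;
; The same 128-byte loop with streaming stores, as a hedged intrinsics sketch
; (simplified as before: one prefetch instead of two, no deferred final store;
; 'da' is 16-byte aligned and 'blocks' is large enough that bypassing the cache
; pays off). The sfence at the end matches the one after the assembly loop:
;
;       #include <emmintrin.h>
;       #include <xmmintrin.h>
;       #include <stddef.h>
;
;       static void copy_128_blocks_nt(char *da, const char *sa, size_t blocks)
;       {
;               for (; blocks; blocks--, da += 128, sa += 128) {
;                       _mm_prefetch(sa + 512, _MM_HINT_NTA);   /* stay a few blocks ahead */
;                       for (int i = 0; i < 128; i += 16)
;                               _mm_stream_si128((__m128i *)(da + i),
;                                                _mm_loadu_si128((const __m128i *)(sa + i)));
;               }
;               _mm_sfence();   /* order the streaming stores before returning */
;       }
;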
XmmCopyLargeOuterNT:
        movntps (-32)[rcx], xmm0                ; store 7th chunk from prior iteration
        movntps (-16)[rcx], xmm1                ; store 8th chunk from prior iteration
XmmCopyLargeInnerNT:                            ; enter loop here with xmm0 preloaded
        prefetchnta [rcx + rdx + 512]           ; prefetch several cache lines ahead
        movups  xmm0, [rcx + rdx]               ; load first 16 byte chunk
        movups  xmm1, 16[rcx + rdx]             ; load 2nd 16 byte chunk
        add     rcx, 128                        ; advance destination address
        movntps (-128)[rcx], xmm0               ; store first 16 byte chunk
        movntps (-112)[rcx], xmm1               ; store 2nd 16 byte chunk
        movups  xmm0, (-96)[rcx + rdx]          ; load 3rd chunk
        movups  xmm1, (-80)[rcx + rdx]          ; load 4th chunk
        dec     r9                              ; dec block counter (set cc for jnz)
        movntps (-96)[rcx], xmm0                ; store 3rd chunk
        movntps (-80)[rcx], xmm1                ; store 4th chunk
        movups  xmm0, (-64)[rcx + rdx]          ; load 5th chunk
        movups  xmm1, (-48)[rcx + rdx]          ; load 6th chunk
        prefetchnta [rcx + rdx + 576]           ; prefetch several cache lines ahead
        movntps (-64)[rcx], xmm0                ; store 5th chunk
        movntps (-48)[rcx], xmm1                ; store 6th chunk
        movups  xmm0, (-32)[rcx + rdx]          ; load 7th chunk
        movups  xmm1, (-16)[rcx + rdx]          ; load 8th chunk
        jnz     XmmCopyLargeOuterNT             ; loop if more blocks
        sfence
        jmp     XmmCopyFinish                   ; rejoin regular memcpy codepath
;
; The source address is less than the destination address.
;
        align   16
;
; Move bytes down using SSE registers. The source address is less than
; the destination address and the buffers overlap. We will do everything
; back-to-front.
;
; Preconditions:
;       destination is r11 (must preserve for return value) and rcx
;       source in r10 (must preserve for remainder move)
;       length in r8, must have been verified to be greater than 16
;       offset from dest to src in rdx
;       source addr < dest addr and the buffers overlap
;
CopyDown:
        add     rcx, r8                         ; make rcx point one past the end of the dst buffer
        movups  xmm0, (-16)[rcx + rdx]          ; load deferred bytes
        sub     rcx, 16                         ; reduce dst addr
        sub     r8, 16                          ; r8 -= 16 in case aligned
;
; Aligned stores using movaps or movups are faster on AMD hardware than unaligned
; stores using movups. To achieve 16-byte dest alignment, we do an unaligned move
; of the last 16 bytes of the buffers, then reduce rcx only by the amount necessary
; to achieve alignment. This results in some bytes getting copied twice, unless we're
; already aligned.
;
; We know the src address is less than the dst, but not by exactly how much. In the
; case where the difference is less than 16 we must be careful about the bytes
; that will be stored twice. We must do both loads before either store, or the
; second load of those bytes will get the wrong values. We handle this by
; deferring the store of 16 aligned bytes to the remainder code, so it can load the
; remainder before storing the deferred bytes. Since either or both of the two
; loops can be skipped, the preconditions needed by the remainder code must
; also apply to the loops. These conditions are:
;       - r8 is the count remaining, not including the deferred bytes
;       - [rcx] points one past the end of the remainder bytes
;       - rdx is the offset from the dst to the source
;       - xmm0 holds the 16 deferred bytes that need to be stored at [rcx]
;
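;
; A hedged C sketch of the backward copy (function and variable names are
; illustrative; simplified to one deferred 16-byte block with no 128-byte
; unrolling). As in the assembly, the aligned block that overlaps the unaligned
; tail is loaded before the tail is stored, so a small dst-src distance cannot
; clobber source bytes that are still needed:
;
;       #include <emmintrin.h>
;       #include <stddef.h>
;       #include <stdint.h>
;
;       /* src < dst, buffers overlap, count > 32 */
;       static void *copy_down(void *dst, const void *src, size_t count)
;       {
;               char *d = (char *)dst;
;               const char *s = (const char *)src;
;               char *de = d + count;
;
;               /* load the unaligned tail and the topmost aligned block, then store the tail */
;               __m128i tail = _mm_loadu_si128((const __m128i *)(s + count - 16));
;               char *da = (char *)((uintptr_t)de & ~(uintptr_t)15);    /* aligned end of dst */
;               __m128i deferred = _mm_loadu_si128((const __m128i *)(s + (da - 16 - d)));
;               _mm_storeu_si128((__m128i *)(de - 16), tail);
;               da -= 16;
;
;               /* aligned 16-byte blocks, walking from the end toward the start;
;                  each block's store is deferred until the next (lower) block is loaded */
;               while ((size_t)(da - d) >= 16) {
;                       __m128i next = _mm_loadu_si128((const __m128i *)(s + (da - 16 - d)));
;                       _mm_store_si128((__m128i *)da, deferred);
;                       deferred = next;
;                       da -= 16;
;               }
;
;               /* load the first 16 bytes before the last deferred store lands just above them */
;               __m128i head = _mm_loadu_si128((const __m128i *)s);
;               _mm_store_si128((__m128i *)da, deferred);
;               _mm_storeu_si128((__m128i *)d, head);
;               return dst;
;       }
;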
        test    cl, 15                          ; test if dest aligned
        jz      XmmMovLargeTest                 ; go try 128-byte blocks
;
; Move alignment bytes.
;
XmmMovAlign:
        mov     rax, rcx                        ; save unaligned store address
        and     rcx, -16                        ; rcx is deferred store address
        movups  xmm1, xmm0                      ; copy unaligned last bytes to xmm1
        movups  xmm0, [rcx + rdx]               ; load deferred-store bytes
        movups  [rax], xmm1                     ; now safe to do unaligned store
        mov     r8, rcx                         ; easier to recalc r8 using rcx-r11 ...
        sub     r8, r11                         ; ... than to calc how much to subtract from r8
;
; See if we can move any 128-byte blocks.
;
XmmMovLargeTest:
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 7                           ; compute number of 128-byte blocks
        jz      short XmmMovSmallTest           ; if z, jump around to 2nd loop
        movaps  [rcx], xmm0                     ; going into 1st loop, ok to store deferred bytes
        jmp     short XmmMovLargeInner          ; jump into 1st loop
;
; Move 128-byte blocks
;
        align   16

XmmMovLargeOuter:
        movaps  (128-112)[rcx], xmm0            ; store 7th chunk from prior iteration
        movaps  (128-128)[rcx], xmm1            ; store 8th chunk from prior iteration
XmmMovLargeInner:
        movups  xmm0, (-16)[rcx + rdx]          ; load first 16 byte chunk
        movups  xmm1, (-32)[rcx + rdx]          ; load 2nd 16 byte chunk
        sub     rcx, 128                        ; reduce destination address
        movaps  (128-16)[rcx], xmm0             ; store first 16 byte chunk
        movaps  (128-32)[rcx], xmm1             ; store 2nd 16 byte chunk
        movups  xmm0, (128-48)[rcx + rdx]       ; load 3rd chunk
        movups  xmm1, (128-64)[rcx + rdx]       ; load 4th chunk
        dec     r9                              ; dec block counter (set cc for jnz)
        movaps  (128-48)[rcx], xmm0             ; store 3rd chunk
        movaps  (128-64)[rcx], xmm1             ; store 4th chunk
        movups  xmm0, (128-80)[rcx + rdx]       ; load 5th chunk
        movups  xmm1, (128-96)[rcx + rdx]       ; load 6th chunk
        movaps  (128-80)[rcx], xmm0             ; store 5th chunk
        movaps  (128-96)[rcx], xmm1             ; store 6th chunk
        movups  xmm0, (128-112)[rcx + rdx]      ; load 7th chunk
        movups  xmm1, (128-128)[rcx + rdx]      ; load 8th chunk
        jnz     short XmmMovLargeOuter          ; loop if more blocks
        movaps  (128-112)[rcx], xmm0            ; store 7th chunk from final iteration
        and     r8, 127                         ; compute remaining byte count
        movaps  xmm0, xmm1                      ; 8th chunk becomes deferred bytes
;
; See if we have any 16-byte blocks left to move
;
XmmMovSmallTest:
        mov     r9, r8                          ; copy count of bytes remaining
        shr     r9, 4                           ; compute number of 16-byte blocks
        jz      short XmmMovTrailing            ; if z, no 16-byte blocks

        align   16

XmmMovSmallLoop:
        movups  [rcx], xmm0                     ; the first time through this is the
                                                ; store of the deferred bytes from above
        sub     rcx, 16                         ; reduce dest addr
        movups  xmm0, [rcx + rdx]               ; load a block
        dec     r9
        jnz     XmmMovSmallLoop

XmmMovTrailing:
        and     r8, 15                          ; compute remaining byte count
        jz      short XmmMovReturn              ; if z, no residual bytes to move
;
; Handle remainder bytes.
;
; As at the start, we are going to do an unaligned copy of 16 bytes which will
; double-write some bytes. We must not touch rcx or xmm0 because they have what we
; need to store the deferred block. But unlike in the XmmCopy code above, we have
; r10 and r11 that we can just use to copy the lowest 16 bytes.
;
        movups  xmm1, [r10]                     ; load lowest 16 bytes, which includes remainder
        movups  [r11], xmm1                     ; store lowest 16 bytes, which includes remainder
XmmMovReturn:
        movups  [rcx], xmm0                     ; store deferred bytes
        mov     rax, r11                        ; we must return the original destination address
        ret

        LEAF_END memcpy, _TEXT

        end