Created
January 26, 2010 00:11
-
-
Save stepancheg/286408 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* strcmp with SSE4.2 | |
Copyright (C) 2009 Free Software Foundation, Inc. | |
Contributed by Intel Corporation. | |
This file is part of the GNU C Library. | |
The GNU C Library is free software; you can redistribute it and/or | |
modify it under the terms of the GNU Lesser General Public | |
License as published by the Free Software Foundation; either | |
version 2.1 of the License, or (at your option) any later version. | |
The GNU C Library is distributed in the hope that it will be useful, | |
but WITHOUT ANY WARRANTY; without even the implied warranty of | |
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
Lesser General Public License for more details. | |
You should have received a copy of the GNU Lesser General Public | |
License along with the GNU C Library; if not, write to the Free | |
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA | |
02111-1307 USA. */ | |
#include <sysdep.h> | |
#include <ifunc-defines.h> | |
/* Build-time configuration.  When USE_AS_STRNCMP is defined this source
   builds the strncmp flavour: %r11 carries the number of bytes still to
   be compared and UPDATE_STRNCMP_COUNTER adjusts it after the first,
   partial 16-byte block.  Otherwise the macro is empty and the strcmp
   symbol names are selected.  */
#ifdef USE_AS_STRNCMP | |
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz | |
if the new counter > the old one or is 0. */ | |
/* The macro computes r9 = r11 + rcx - 16, leaves through
   strcmp_exitz_sse4_2 when that value wrapped around or is zero, and
   otherwise stores it back into r11.  It overwrites %r9 and the flags.  */
#define UPDATE_STRNCMP_COUNTER \ | |
/* calculate left number to compare */ \ | |
lea -16(%rcx, %r11), %r9; \ | |
cmp %r9, %r11; \ | |
jb LABEL(strcmp_exitz_sse4_2); \ | |
test %r9, %r9; \ | |
je LABEL(strcmp_exitz_sse4_2); \ | |
mov %r9, %r11 | |
/* Symbol names of the strncmp variants.  NOTE(review): STRCMP itself is
   not defined in this branch; presumably the file that sets
   USE_AS_STRNCMP also defines it - confirm.  */
#define STRCMP_SSE42 __strncmp_sse42 | |
#define STRCMP_SSSE3 __strncmp_ssse3 | |
#define STRCMP_SSE2 __strncmp_sse2 | |
#define __GI_STRCMP __GI_strncmp | |
#else | |
/* strcmp build: there is no length limit, so the counter update is empty.  */
#define UPDATE_STRNCMP_COUNTER | |
/* Default to the strcmp names unless the includer already chose STRCMP.  */
#ifndef STRCMP | |
#define STRCMP strcmp | |
#define STRCMP_SSE42 __strcmp_sse42 | |
#define STRCMP_SSSE3 __strcmp_ssse3 | |
#define STRCMP_SSE2 __strcmp_sse2 | |
#define __GI_STRCMP __GI_strcmp | |
#endif | |
#endif | |
/* LABEL wraps every local label name.  It defaults to the L() macro,
   which is not defined in this file (presumably it comes from sysdep.h -
   confirm).  */
#ifndef LABEL | |
#define LABEL(l) L(l) | |
#endif | |
/* Define multiple versions only for the definition in libc. Don't | |
define multiple versions for strncmp in static library since we | |
need strncmp before the initialization happened. */ | |
#if (defined SHARED || !defined USE_AS_STRNCMP) && !defined NOT_IN_libc | |
.text | |
/* IFUNC resolver for STRCMP.

   Returns in %rax the address of the implementation to bind to, chosen
   from the CPUID leaf 1 ECX word cached in __cpu_features:
     bit 20 set (SSE4.2)  -> STRCMP_SSE42
     bit  9 set (SSSE3)   -> STRCMP_SSSE3
     otherwise            -> STRCMP_SSE2
   Clobbers %rax, the flags and whatever __init_cpu_features clobbers.  */
ENTRY(STRCMP)
	.type	STRCMP, @gnu_indirect_function
	/* A zero "kind" field is taken to mean that __cpu_features has
	   not been filled in yet.  */
	cmpl	$0, __cpu_features+KIND_OFFSET(%rip)
	jne	1f
	/* On entry %rsp is 8 bytes off a 16-byte boundary (the return
	   address has just been pushed).  The SysV AMD64 ABI requires
	   16-byte alignment at every call site, so pad the stack around
	   the call and keep the unwind information in step.  */
	subq	$8, %rsp
	cfi_adjust_cfa_offset (8)
	call	__init_cpu_features
	addq	$8, %rsp
	cfi_adjust_cfa_offset (-8)
1:
	/* Preload the best candidate and keep it if its feature bit is set.  */
	leaq	STRCMP_SSE42(%rip), %rax
	testl	$(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
	jnz	2f
	leaq	STRCMP_SSSE3(%rip), %rax
	testl	$(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
	jnz	2f
	/* Baseline fallback.  */
	leaq	STRCMP_SSE2(%rip), %rax
2:	ret
END(STRCMP)
/* We use 0x1a:
	_SIDD_SBYTE_OPS
	| _SIDD_CMP_EQUAL_EACH
	| _SIDD_NEGATIVE_POLARITY
	| _SIDD_LEAST_SIGNIFICANT
   on pcmpistri to find out if two 16-byte data elements are the same
   and the offset of the first different byte.  There are 4 cases:

   1. Both 16-byte data elements are valid and identical.
   2. Both 16-byte data elements contain EOS and are identical.
   3. Both 16-byte data elements are valid and they differ at offset X.
   4. At least one 16-byte data element has EOS at offset X.  The two
      16-byte data elements must differ at or before offset X.

   Here is the table of ECX, CFlag, ZFlag and SFlag for the 4 cases:

   case		ECX	CFlag	ZFlag	SFlag
    1		16	  0	  0	  0
    2		16	  0	  1	  1
    3		 X	  1	  0	  0
    4	       0 <= X	  1	 0/1	 0/1

   We exit from the loop for cases 2, 3 and 4 with jbe, which branches
   when either CFlag or ZFlag is 1.  If CFlag == 0, we return 0 for
   case 2.  */
/* Put all SSE 4.2 functions together. */ | |
.section .text.sse4.2,"ax",@progbits | |
.align 16 | |
.type STRCMP_SSE42, @function | |
/* SSE4.2 string compare, entry and dispatch part.
   Reads the two string pointers from %rdi and %rsi and, in strncmp
   builds, the byte limit from %rdx (copied to %r11).  The visible exit
   in ashr_0 returns the byte difference in %eax.  Scratch registers
   used in this part: rax, rcx, rdx, r8, r9, r10, r11, xmm0-xmm2.  */
STRCMP_SSE42: | |
cfi_startproc | |
CALL_MCOUNT | |
/* | |
* This implementation uses SSE to compare up to 16 bytes at a time. | |
*/ | |
/* strncmp only: a limit of 0 leaves through strcmp_exitz_sse4_2, a limit
   of 1 through Byte0_sse4_2 (both defined outside this view).  */
#ifdef USE_AS_STRNCMP | |
test %rdx, %rdx | |
je LABEL(strcmp_exitz_sse4_2) | |
cmp $1, %rdx | |
je LABEL(Byte0_sse4_2) | |
mov %rdx, %r11 | |
#endif | |
mov %esi, %ecx | |
mov %edi, %eax | |
/* Use 64bit AND here to avoid long NOP padding. */ | |
and $0x3f, %rcx /* rsi alignment in cache line */ | |
and $0x3f, %rax /* rdi alignment in cache line */ | |
/* An offset above 0x30 means fewer than 16 bytes remain in the 64-byte
   line, so the unaligned loads below are skipped for that case.  */
cmp $0x30, %ecx | |
ja LABEL(crosscache_sse4_2)/* rsi: 16-byte load will cross cache line */ | |
cmp $0x30, %eax | |
ja LABEL(crosscache_sse4_2)/* rdi: 16-byte load will cross cache line */ | |
/* Fast path: compare the first 16 bytes with unaligned loads.  After
   psubb a byte has its top bit set only where the two strings are equal
   and the byte is not a null char, so pmovmskb gives 0xffff exactly when
   all 16 bytes match and none of them terminates the string.  */
movdqu (%rdi), %xmm1 | |
movdqu (%rsi), %xmm2 | |
pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ | |
pcmpeqb %xmm1, %xmm0 /* Any null chars? */ | |
pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ | |
psubb %xmm0, %xmm1 /* packed sub of comparison results*/ | |
pmovmskb %xmm1, %edx | |
sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ | |
jnz LABEL(less16bytes_sse4_2)/* If not, find different value or null char */ | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2)/* finish comparison */ | |
#endif | |
add $16, %rsi /* prepare to search next 16 bytes */ | |
add $16, %rdi /* prepare to search next 16 bytes */ | |
/* | |
* Determine source and destination string offsets from 16-byte alignment. | |
* Use relative offset difference between the two to determine which case | |
* below to use. | |
*/ | |
.p2align 4 | |
LABEL(crosscache_sse4_2): | |
/* Round both pointers down to a 16-byte boundary and keep the discarded
   low four bits in ecx (rsi) and eax (rdi).  edx holds the all-ones
   16-bit mask that the first-block checks compare against.  */
and $0xfffffffffffffff0, %rsi /* align %rsi down to 16 bytes */ | |
and $0xfffffffffffffff0, %rdi /* align %rdi down to 16 bytes */ | |
mov $0xffff, %edx /* for equivalent offset */ | |
xor %r8d, %r8d | |
and $0xf, %ecx /* offset of rsi */ | |
and $0xf, %eax /* offset of rdi */ | |
cmp %eax, %ecx | |
je LABEL(ashr_0_sse4_2) /* rsi and rdi relative offset same */ | |
ja LABEL(bigger_sse4_2) | |
/* Offset of rsi is the smaller one: swap the two strings (pointers and
   offsets) so that rcx always holds the larger offset, and record the
   swap as a non-zero r8d for the exit code.  */
mov %edx, %r8d /* r8d is offset flag for exit tail */ | |
xchg %ecx, %eax | |
xchg %rsi, %rdi | |
LABEL(bigger_sse4_2): | |
/* Table index r9 = 15 + rax - rcx.  Each entry of unaligned_table_sse4_2
   (defined outside this view) is read as a signed 32-bit offset that is
   added to the table address to form the jump target.  */
lea 15(%rax), %r9 | |
sub %rcx, %r9 | |
lea LABEL(unaligned_table_sse4_2)(%rip), %r10 | |
movslq (%r10, %r9,4), %r9 | |
lea (%r10, %r9), %r10 | |
jmp *%r10 /* jump to corresponding case */ | |
/* | |
* The following cases will be handled by ashr_0 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(0~15) n(0~15) 15(15+ n-n) ashr_0 | |
*/ | |
.p2align 4 | |
LABEL(ashr_0_sse4_2): | |
/* ashr_0: both strings have the same offset inside their 16-byte block,
   so aligned loads line up byte for byte.  The first block is checked
   with pcmpeqb; r9d receives one bit per byte that is equal and not a
   null char.  */
movdqa (%rsi), %xmm1 | |
pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ | |
pcmpeqb %xmm1, %xmm0 /* Any null chars? */ | |
pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ | |
psubb %xmm0, %xmm1 /* packed sub of comparison results*/ | |
pmovmskb %xmm1, %r9d | |
/* cl is the common offset: drop the mask bits of the bytes in front of
   the string start from both the expected mask (edx) and the computed
   mask (r9d).  The subtraction is zero only when every remaining byte
   is equal and not a null char.  */
shr %cl, %edx /* adjust 0xffff for offset */ | |
shr %cl, %r9d /* adjust for 16-byte offset */ | |
sub %r9d, %edx | |
/* | |
* edx must be the same with r9d if in left byte (16-rcx) is equal to | |
* the start from (16-rax) and no null char was seen. | |
*/ | |
jne LABEL(less32bytes_sse4_2) /* mismatch or null char */ | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
mov $16, %rcx | |
mov $16, %r9 | |
pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ | |
/* | |
* Now both strings are aligned at 16-byte boundary. Loop over strings | |
* checking 32-bytes per iteration. | |
*/ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  It is advanced with lea between pcmpistri and jbe because
   lea leaves the flags produced by pcmpistri untouched.  jbe leaves the
   loop when CFlag or ZFlag is set (a difference or an end of string).  */
LABEL(ashr_0_use_sse4_2): | |
movdqa (%rdi,%rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
lea 16(%rdx), %rdx | |
jbe LABEL(ashr_0_use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
movdqa (%rdi,%rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
lea 16(%rdx), %rdx | |
jbe LABEL(ashr_0_use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
jmp LABEL(ashr_0_use_sse4_2) | |
.p2align 4 | |
/* Loop exit.  CFlag clear means the blocks were identical up to the end
   of string, which goes to strcmp_exitz_sse4_2.  Otherwise ecx is the
   index of the first differing byte inside the block.  In strncmp builds
   a difference at or beyond the remaining limit in r11 also goes to
   strcmp_exitz_sse4_2.  rdx has already been advanced by 16, hence the
   -16 when forming the byte offset.  The result in eax is the rdi byte
   minus the rsi byte, both zero-extended.  */
LABEL(ashr_0_use_sse4_2_exit): | |
jnc LABEL(strcmp_exitz_sse4_2) | |
#ifdef USE_AS_STRNCMP | |
sub %rcx, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
lea -16(%rdx, %rcx), %rcx | |
movzbl (%rdi, %rcx), %eax | |
movzbl (%rsi, %rcx), %edx | |
sub %edx, %eax | |
ret | |
/* | |
* The following cases will be handled by ashr_1 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(15) n -15 0(15 +(n-15) - n) ashr_1 | |
*/ | |
.p2align 4 | |
LABEL(ashr_1_sse4_2): | |
/* ashr_1: the %rdi stream is read 1 byte into each aligned 16-byte block
   (palignr $1 below).  The first block is checked with pcmpeqb: the
   (%rdi) block is moved up 15 byte lanes to line up with the (%rsi)
   block, and r9d receives one bit per byte that is equal and not a null
   char.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 /* Any null chars? */ | |
pslldq $15, %xmm2 /* shift first string to align with second */ | |
pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ | |
psubb %xmm0, %xmm2 /* packed sub of comparison results*/ | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  */
shr %cl, %edx /* adjust 0xffff for offset */ | |
shr %cl, %r9d /* adjust for 16-byte offset */ | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2)/* mismatch or null char seen */ | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads*/ | |
mov $1, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 1(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 1 byte into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_1_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_1_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $1, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_1_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $1, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_1_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 15 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 14, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_1_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $1, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $14, %ecx | |
ja LABEL(loop_ashr_1_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_2 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(14~15) n -14 1(15 +(n-14) - n) ashr_2 | |
*/ | |
.p2align 4 | |
LABEL(ashr_2_sse4_2): | |
/* ashr_2: the %rdi stream is read 2 bytes into each aligned 16-byte
   block (palignr $2 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 14 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $14, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $2, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 2(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 2 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_2_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_2_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $2, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_2_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $2, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_2_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 14 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 13, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_2_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $2, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $13, %ecx | |
ja LABEL(loop_ashr_2_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_3 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(13~15) n -13 2(15 +(n-13) - n) ashr_3 | |
*/ | |
.p2align 4 | |
LABEL(ashr_3_sse4_2): | |
/* ashr_3: the %rdi stream is read 3 bytes into each aligned 16-byte
   block (palignr $3 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 13 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $13, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $3, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 3(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 3 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.
   NOTE(review): unlike the other loop_ashr_N labels in this file, this
   one is not preceded by .p2align 4 - confirm whether that is
   intentional.  */
LABEL(loop_ashr_3_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_3_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $3, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_3_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $3, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_3_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 13 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 12, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_3_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $3, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $12, %ecx | |
ja LABEL(loop_ashr_3_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_4 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(12~15) n -12 3(15 +(n-12) - n) ashr_4 | |
*/ | |
.p2align 4 | |
LABEL(ashr_4_sse4_2): | |
/* ashr_4: the %rdi stream is read 4 bytes into each aligned 16-byte
   block (palignr $4 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 12 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $12, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $4, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 4(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 4 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_4_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_4_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $4, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_4_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $4, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_4_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 12 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 11, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_4_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $4, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $11, %ecx | |
ja LABEL(loop_ashr_4_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_5 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 | |
*/ | |
.p2align 4 | |
LABEL(ashr_5_sse4_2): | |
/* ashr_5: the %rdi stream is read 5 bytes into each aligned 16-byte
   block (palignr $5 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 11 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $11, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $5, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 5(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 5 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_5_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_5_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $5, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_5_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $5, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_5_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 11 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 10, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_5_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $5, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $10, %ecx | |
ja LABEL(loop_ashr_5_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_6 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 | |
*/ | |
.p2align 4 | |
LABEL(ashr_6_sse4_2): | |
/* ashr_6: the %rdi stream is read 6 bytes into each aligned 16-byte
   block (palignr $6 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 10 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $10, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $6, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 6(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 6 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_6_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_6_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $6, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_6_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $6, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_6_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 10 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 9, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_6_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $6, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $9, %ecx | |
ja LABEL(loop_ashr_6_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_7 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 | |
*/ | |
.p2align 4 | |
LABEL(ashr_7_sse4_2): | |
/* ashr_7: the %rdi stream is read 7 bytes into each aligned 16-byte
   block (palignr $7 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 9 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $9, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $7, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 7(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 7 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_7_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_7_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $7, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_7_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $7, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_7_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 9 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 8, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_7_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $7, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $8, %ecx | |
ja LABEL(loop_ashr_7_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_8 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 | |
*/ | |
.p2align 4 | |
LABEL(ashr_8_sse4_2): | |
/* ashr_8: the %rdi stream is read 8 bytes into each aligned 16-byte
   block (palignr $8 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 8 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $8, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $8, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 8(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 8 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_8_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_8_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $8, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_8_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $8, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_8_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 8 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 7, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_8_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $8, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $7, %ecx | |
ja LABEL(loop_ashr_8_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_9 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 | |
*/ | |
.p2align 4 | |
LABEL(ashr_9_sse4_2): | |
/* ashr_9: the %rdi stream is read 9 bytes into each aligned 16-byte
   block (palignr $9 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 7 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $7, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $9, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 9(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 9 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_9_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_9_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $9, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_9_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $9, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_9_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 7 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 6, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_9_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $9, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $6, %ecx | |
ja LABEL(loop_ashr_9_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_10 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 | |
*/ | |
.p2align 4 | |
LABEL(ashr_10_sse4_2): | |
/* ashr_10: the %rdi stream is read 10 bytes into each aligned 16-byte
   block (palignr $10 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 6 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $6, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $10, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 10(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 10 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_10_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_10_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $10, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_10_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $10, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_10_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 6 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 5, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_10_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $10, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $5, %ecx | |
ja LABEL(loop_ashr_10_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/* | |
* The following cases will be handled by ashr_11 | |
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case | |
* n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 | |
*/ | |
.p2align 4 | |
LABEL(ashr_11_sse4_2): | |
/* ashr_11: the %rdi stream is read 11 bytes into each aligned 16-byte
   block (palignr $11 below).  First block: xmm0 becomes the null-byte
   mask of the (%rsi) block, the (%rdi) block is moved up 5 byte lanes
   and compared with it.  After psubb a byte has its top bit set only
   where the two bytes are equal and not a null char; pmovmskb gathers
   those bits into r9d.  */
pxor %xmm0, %xmm0 | |
movdqa (%rdi), %xmm2 | |
movdqa (%rsi), %xmm1 | |
pcmpeqb %xmm1, %xmm0 | |
pslldq $5, %xmm2 | |
pcmpeqb %xmm1, %xmm2 | |
psubb %xmm0, %xmm2 | |
pmovmskb %xmm2, %r9d | |
/* cl is the offset of the string start inside the %rsi block: drop the
   mask bits of the bytes in front of it before comparing against the
   expected all-ones mask kept in edx.  A non-zero difference means a
   mismatch or a null char in the first block.  */
shr %cl, %edx | |
shr %cl, %r9d | |
sub %r9d, %edx | |
jnz LABEL(less32bytes_sse4_2) | |
/* NOTE(review): xmm3 is loaded here but is not read again in the visible
   part of this section - confirm whether a later exit path uses it.  */
movdqa (%rdi), %xmm3 | |
/* For strncmp builds this charges the 16 - rcx bytes just checked
   against the %r11 limit; it expands to nothing otherwise.  */
UPDATE_STRNCMP_COUNTER | |
pxor %xmm0, %xmm0 | |
mov $16, %rcx /* index for loads */ | |
mov $11, %r9d /* byte position left over from less32bytes case */ | |
/* | |
* Setup %r10 value allows us to detect crossing a page boundary. | |
* When %r10 goes positive we have crossed a page boundary and | |
* need to do a nibble. | |
*/ | |
lea 11(%rdi), %r10 | |
and $0xfff, %r10 /* offset into 4K page */ | |
sub $0x1000, %r10 /* subtract 4K pagesize */ | |
mov %rcx, %rdx /* only for offset of sse4 instruction loop*/ | |
.p2align 4 | |
/* Main loop, unrolled twice.  rdx is the byte index applied to both
   pointers.  Each step first advances the page counter r10, then forms
   16 bytes of the %rdi stream starting 11 bytes into the previous block
   and compares them with the aligned %rsi block.  jbe leaves the loop on
   a difference or an end of string.  */
LABEL(loop_ashr_11_use_sse4_2): | |
add $16, %r10 | |
jg LABEL(nibble_ashr_11_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $11, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
add $16, %r10 | |
jg LABEL(nibble_ashr_11_use_sse4_2) | |
movdqa (%rdi, %rdx), %xmm0 | |
palignr $11, -16(%rdi, %rdx), %xmm0 | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
jbe LABEL(use_sse4_2_exit) | |
#ifdef USE_AS_STRNCMP | |
sub $16, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
add $16, %rdx | |
jmp LABEL(loop_ashr_11_use_sse4_2) | |
.p2align 4 | |
/* Page-boundary path: the next 16-byte load from the %rdi stream would
   start a new 4K page.  Re-arm r10, then look only at the 5 bytes of
   the previous block that are still pending.  pcmpistri $0x3a with both
   operands equal is used to get in ecx the index of the first null byte
   among them (imm8 encoding: see the Intel SDM).  The loop resumes when
   ecx is above 4, that is when no terminator was found.  Otherwise, or
   in strncmp builds when rcx is not below the remaining limit in r11,
   control goes to nibble_ashr_use_sse4_2_exit (outside this view).  */
LABEL(nibble_ashr_11_use_sse4_2): | |
sub $0x1000, %r10 | |
movdqa -16(%rdi, %rdx), %xmm0 | |
psrldq $11, %xmm0 | |
pcmpistri $0x3a,%xmm0, %xmm0 | |
#ifdef USE_AS_STRNCMP | |
cmp %r11, %rcx | |
jae LABEL(nibble_ashr_use_sse4_2_exit) | |
#endif | |
cmp $4, %ecx | |
ja LABEL(loop_ashr_11_use_sse4_2) | |
jmp LABEL(nibble_ashr_use_sse4_2_exit) | |
/*
 *  The following cases will be handled by ashr_12
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
 *        n(4~15)            n - 4           11(15 +(n - 4) - n)       ashr_12
 *
 * SSE4.2 variant.  The (%rsi) side is compared block by block as is;
 * every 16-byte window on the (%rdi) side is assembled with palignr $12
 * from two consecutive aligned loads.
 *
 * Register roles in this section:
 *   %rdi, %rsi   block bases; every 16-byte access to them is an aligned
 *                movdqa
 *   %cl, %edx    on entry: shift count and bit mask prepared by the code
 *                that dispatched here (set up outside this section)
 *   %rdx         in the loop: byte offset of the block being loaded
 *   %r9d         12; use_sse4_2_exit uses it to rebuild the byte address
 *                on the %rdi side
 *   %r10         page-cross tracker, see below
 *   %r11         remaining byte count (USE_AS_STRNCMP only)
 */
	.p2align 4
LABEL(ashr_12_sse4_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where (%rsi) holds NUL */
	pslldq	$4, %xmm2		/* shift (%rdi) up by 16 - 12 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes agree */
	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes_sse4_2)	/* masks differ: finish in less32bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$12, %r9d	/* byte position left over from less32bytes case */

	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	12(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_12_use_sse4_2):
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use_sse4_2)

	/*
	 * Re-entry point after a nibble check.  %r10 has already been
	 * advanced for this block, so the nibble path has to come back
	 * here and not to loop_ashr_12_use_sse4_2: going through the add
	 * again would push %r10 a further 16 bytes ahead on every page,
	 * and later page crossings would be checked on the wrong block.
	 */
LABEL(nibble_ashr_12_restart_use_sse4_2):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$12, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)	/* CF: bytes differ; ZF: NUL in the (%rsi) block */
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx

	/* Second copy of the loop body (unrolled by two).  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_12_use_sse4_2)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$12, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_12_use_sse4_2)

	/*
	 * %r10 went positive.  Before loading the block at (%rdi, %rdx),
	 * look for NUL among the 4 bytes of the previous block that have
	 * not been compared yet.
	 */
	.p2align 4
LABEL(nibble_ashr_12_use_sse4_2):
	sub	$0x1000, %r10
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$12, %xmm0		/* keep bytes 12..15, zero-fill above them */
	pcmpistri $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL (4 if the tail has none) */
#ifdef USE_AS_STRNCMP
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_use_sse4_2_exit)
#endif
	cmp	$3, %ecx
	ja	LABEL(nibble_ashr_12_restart_use_sse4_2)	/* no NUL in the tail: keep going */

	jmp	LABEL(nibble_ashr_use_sse4_2_exit)
/*
 *  The following cases will be handled by ashr_13
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
 *        n(3~15)            n - 3           12(15 +(n - 3) - n)       ashr_13
 *
 * SSE4.2 variant.  The (%rsi) side is compared block by block as is;
 * every 16-byte window on the (%rdi) side is assembled with palignr $13
 * from two consecutive aligned loads.
 *
 * Register roles in this section:
 *   %rdi, %rsi   block bases; every 16-byte access to them is an aligned
 *                movdqa
 *   %cl, %edx    on entry: shift count and bit mask prepared by the code
 *                that dispatched here (set up outside this section)
 *   %rdx         in the loop: byte offset of the block being loaded
 *   %r9d         13; use_sse4_2_exit uses it to rebuild the byte address
 *                on the %rdi side
 *   %r10         page-cross tracker, see below
 *   %r11         remaining byte count (USE_AS_STRNCMP only)
 */
	.p2align 4
LABEL(ashr_13_sse4_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where (%rsi) holds NUL */
	pslldq	$3, %xmm2		/* shift (%rdi) up by 16 - 13 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes agree */
	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes_sse4_2)	/* masks differ: finish in less32bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$13, %r9d	/* byte position left over from less32bytes case */

	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	13(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_13_use_sse4_2):
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use_sse4_2)

	/*
	 * Re-entry point after a nibble check.  %r10 has already been
	 * advanced for this block, so the nibble path has to come back
	 * here and not to loop_ashr_13_use_sse4_2: going through the add
	 * again would push %r10 a further 16 bytes ahead on every page,
	 * and later page crossings would be checked on the wrong block.
	 */
LABEL(nibble_ashr_13_restart_use_sse4_2):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$13, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)	/* CF: bytes differ; ZF: NUL in the (%rsi) block */
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx

	/* Second copy of the loop body (unrolled by two).  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_13_use_sse4_2)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$13, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_13_use_sse4_2)

	/*
	 * %r10 went positive.  Before loading the block at (%rdi, %rdx),
	 * look for NUL among the 3 bytes of the previous block that have
	 * not been compared yet.
	 */
	.p2align 4
LABEL(nibble_ashr_13_use_sse4_2):
	sub	$0x1000, %r10
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$13, %xmm0		/* keep bytes 13..15, zero-fill above them */
	pcmpistri $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL (3 if the tail has none) */
#ifdef USE_AS_STRNCMP
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_use_sse4_2_exit)
#endif
	cmp	$2, %ecx
	ja	LABEL(nibble_ashr_13_restart_use_sse4_2)	/* no NUL in the tail: keep going */

	jmp	LABEL(nibble_ashr_use_sse4_2_exit)
/*
 *  The following cases will be handled by ashr_14
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
 *        n(2~15)            n - 2           13(15 +(n - 2) - n)       ashr_14
 *
 * SSE4.2 variant.  The (%rsi) side is compared block by block as is;
 * every 16-byte window on the (%rdi) side is assembled with palignr $14
 * from two consecutive aligned loads.
 *
 * Register roles in this section:
 *   %rdi, %rsi   block bases; every 16-byte access to them is an aligned
 *                movdqa
 *   %cl, %edx    on entry: shift count and bit mask prepared by the code
 *                that dispatched here (set up outside this section)
 *   %rdx         in the loop: byte offset of the block being loaded
 *   %r9d         14; use_sse4_2_exit uses it to rebuild the byte address
 *                on the %rdi side
 *   %r10         page-cross tracker, see below
 *   %r11         remaining byte count (USE_AS_STRNCMP only)
 */
	.p2align 4
LABEL(ashr_14_sse4_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where (%rsi) holds NUL */
	pslldq	$2, %xmm2		/* shift (%rdi) up by 16 - 14 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes agree */
	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes_sse4_2)	/* masks differ: finish in less32bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$14, %r9d	/* byte position left over from less32bytes case */

	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	14(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_14_use_sse4_2):
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use_sse4_2)

	/*
	 * Re-entry point after a nibble check.  %r10 has already been
	 * advanced for this block, so the nibble path has to come back
	 * here and not to loop_ashr_14_use_sse4_2: going through the add
	 * again would push %r10 a further 16 bytes ahead on every page,
	 * and later page crossings would be checked on the wrong block.
	 */
LABEL(nibble_ashr_14_restart_use_sse4_2):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$14, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)	/* CF: bytes differ; ZF: NUL in the (%rsi) block */
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx

	/* Second copy of the loop body (unrolled by two).  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_14_use_sse4_2)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$14, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_14_use_sse4_2)

	/*
	 * %r10 went positive.  Before loading the block at (%rdi, %rdx),
	 * look for NUL among the 2 bytes of the previous block that have
	 * not been compared yet.
	 */
	.p2align 4
LABEL(nibble_ashr_14_use_sse4_2):
	sub	$0x1000, %r10
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$14, %xmm0		/* keep bytes 14..15, zero-fill above them */
	pcmpistri $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL (2 if the tail has none) */
#ifdef USE_AS_STRNCMP
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_use_sse4_2_exit)
#endif
	cmp	$1, %ecx
	ja	LABEL(nibble_ashr_14_restart_use_sse4_2)	/* no NUL in the tail: keep going */

	jmp	LABEL(nibble_ashr_use_sse4_2_exit)
/*
 *  The following cases will be handled by ashr_15
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset      corresponding case
 *        n(1~15)            n - 1           14(15 +(n - 1) - n)       ashr_15
 *
 * SSE4.2 variant.  The (%rsi) side is compared block by block as is;
 * every 16-byte window on the (%rdi) side is assembled with palignr $15
 * from two consecutive aligned loads.
 *
 * Register roles in this section:
 *   %rdi, %rsi   block bases; every 16-byte access to them is an aligned
 *                movdqa
 *   %cl, %edx    on entry: shift count and bit mask prepared by the code
 *                that dispatched here (set up outside this section)
 *   %rdx         in the loop: byte offset of the block being loaded
 *   %r9d         15; use_sse4_2_exit uses it to rebuild the byte address
 *                on the %rdi side
 *   %r10         page-cross tracker, see below
 *   %r11         remaining byte count (USE_AS_STRNCMP only)
 *
 * The nibble path at the end of this section falls through into
 * nibble_ashr_use_sse4_2_exit, which therefore has to follow directly.
 */
	.p2align 4
LABEL(ashr_15_sse4_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* 0xff where (%rsi) holds NUL */
	pslldq	$1, %xmm2		/* shift (%rdi) up by 16 - 15 bytes */
	pcmpeqb	%xmm1, %xmm2		/* 0xff where the bytes agree */
	psubb	%xmm0, %xmm2		/* top bit stays set only for equal, non-NUL bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx
	shr	%cl, %r9d
	sub	%r9d, %edx
	jnz	LABEL(less32bytes_sse4_2)	/* masks differ: finish in less32bytes */
	movdqa	(%rdi), %xmm3

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx	/* index for loads */
	mov	$15, %r9d	/* byte position left over from less32bytes case */

	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	15(%rdi), %r10
	and	$0xfff, %r10	/* offset into 4K page */
	sub	$0x1000, %r10	/* subtract 4K pagesize */
	mov	%rcx, %rdx	/* only for offset of sse4 instruction loop*/

	.p2align 4
LABEL(loop_ashr_15_use_sse4_2):
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use_sse4_2)

	/*
	 * Re-entry point after a nibble check.  %r10 has already been
	 * advanced for this block, so the nibble path has to come back
	 * here and not to loop_ashr_15_use_sse4_2: going through the add
	 * again would push %r10 a further 16 bytes ahead on every page,
	 * and later page crossings would be checked on the wrong block.
	 */
LABEL(nibble_ashr_15_restart_use_sse4_2):
	movdqa	(%rdi, %rdx), %xmm0
	palignr	$15, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)	/* CF: bytes differ; ZF: NUL in the (%rsi) block */
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx

	/* Second copy of the loop body (unrolled by two).  */
	add	$16, %r10
	jg	LABEL(nibble_ashr_15_use_sse4_2)

	movdqa	(%rdi, %rdx), %xmm0
	palignr	$15, -16(%rdi, %rdx), %xmm0
	pcmpistri $0x1a,(%rsi,%rdx), %xmm0
	jbe	LABEL(use_sse4_2_exit)
#ifdef USE_AS_STRNCMP
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz_sse4_2)
#endif
	add	$16, %rdx
	jmp	LABEL(loop_ashr_15_use_sse4_2)

	/*
	 * %r10 went positive.  Before loading the block at (%rdi, %rdx),
	 * check whether the one byte of the previous block that has not
	 * been compared yet is NUL.
	 */
	.p2align 4
LABEL(nibble_ashr_15_use_sse4_2):
	sub	$0x1000, %r10
	movdqa	-16(%rdi, %rdx), %xmm0
	psrldq	$15, %xmm0		/* keep byte 15, zero-fill above it */
	pcmpistri $0x3a,%xmm0, %xmm0	/* %ecx = index of the first NUL (1 if byte 15 is not NUL) */
#ifdef USE_AS_STRNCMP
	cmp	%r11, %rcx
	jae	LABEL(nibble_ashr_use_sse4_2_exit)
#endif
	cmp	$0, %ecx
	ja	LABEL(nibble_ashr_15_restart_use_sse4_2)	/* byte 15 is not NUL: keep going */
	/* Otherwise fall through into nibble_ashr_use_sse4_2_exit.  */
/*
 * Common exits of the SSE4.2 loops.
 *
 * nibble_ashr_use_sse4_2_exit is reached from the nibble paths with the
 * remaining tail bytes of the previous (%rdi) block in %xmm0.  It runs
 * the same pcmpistri the loops use and then continues into
 * use_sse4_2_exit with the flags and %ecx that instruction produced.
 *
 * use_sse4_2_exit expects the flags and %ecx of a pcmpistri $0x1a:
 * CF clear means no differing byte was reported, so the result is 0;
 * otherwise %ecx is the index of the first differing byte in the block.
 */
LABEL(nibble_ashr_use_sse4_2_exit): | |
pcmpistri $0x1a,(%rsi,%rdx), %xmm0 | |
.p2align 4 | |
LABEL(use_sse4_2_exit): | |
jnc LABEL(strcmp_exitz_sse4_2) | |
#ifdef USE_AS_STRNCMP | |
/* The difference lies at or beyond the remaining count in %r11: return 0. */
sub %rcx, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
/* %rdx = byte offset of the differing position on the %rsi side. */
add %rcx, %rdx | |
/* Move %rdi by %r9 - 16 so the same %rdx addresses the matching byte
   of the palignr-assembled window on the %rdi side. */
lea -16(%rdi, %r9), %rdi | |
movzbl (%rdi, %rdx), %eax | |
movzbl (%rsi, %rdx), %edx | |
/* Non-zero %r8d: swap the two bytes before subtracting. */
test %r8d, %r8d | |
jz LABEL(use_sse4_2_ret_sse4_2) | |
xchg %eax, %edx | |
LABEL(use_sse4_2_ret_sse4_2): | |
/* Return the difference of the two zero-extended bytes in %eax. */
sub %edx, %eax | |
ret | |
/*
 * less32bytes: the first-block masks of an ashr_N case differed.
 * Adds the offsets in %rax and %rcx to the two pointers, swaps the
 * pointers when %r8d is non-zero, and continues into the code shared
 * under the ret_sse4_2 and less16bytes_sse4_2 labels.
 *
 * That shared code takes %rdx as a non-zero bit mask: the index of its
 * lowest set bit is the byte offset at which the two strings are read
 * and subtracted.
 */
LABEL(less32bytes_sse4_2): | |
lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ | |
lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ | |
test %r8d, %r8d | |
jz LABEL(ret_sse4_2) | |
xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ | |
.p2align 4 | |
LABEL(ret_sse4_2): | |
LABEL(less16bytes_sse4_2): | |
bsf %rdx, %rdx /* find and store bit index in %rdx */ | |
#ifdef USE_AS_STRNCMP | |
/* That byte lies at or beyond the remaining count in %r11: return 0. */
sub %rdx, %r11 | |
jbe LABEL(strcmp_exitz_sse4_2) | |
#endif | |
/* Return (%rdi)[%rdx] - (%rsi)[%rdx], both bytes zero-extended. */
movzbl (%rsi, %rdx), %ecx | |
movzbl (%rdi, %rdx), %eax | |
sub %ecx, %eax | |
ret | |
/* Shared exit that returns 0 in %eax. */
LABEL(strcmp_exitz_sse4_2): | |
xor %eax, %eax | |
ret | |
/*
 * Byte0: return the difference of the single bytes at (%rdi) and (%rsi),
 * each zero-extended to 32 bits.  This is the last label of the SSE4.2
 * function; cfi_endproc and .size below close it.
 */
	.p2align 4
LABEL(Byte0_sse4_2):
	/*
	 * movzbl states the byte-sized source explicitly.  A bare movzx
	 * with a memory source carries no operand size in AT&T syntax.
	 */
	movzbl	(%rsi), %ecx
	movzbl	(%rdi), %eax

	sub	%ecx, %eax
	ret
	cfi_endproc
	.size	STRCMP_SSE42, .-STRCMP_SSE42
/* Put all SSE 4.2 functions together. */ | |
/*
 * Read-only table of sixteen 32-bit entries, aligned to 8 bytes.  Each
 * entry is the distance from the start of the table to one ashr_N
 * label.  Entries 0..14 hold ashr_1..ashr_15 in order and the last entry
 * holds ashr_0.  The code that indexes the table is outside this
 * section.
 */
.section .rodata.sse4.2,"a",@progbits | |
.p2align 3 | |
LABEL(unaligned_table_sse4_2): | |
.int LABEL(ashr_1_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_2_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_3_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_4_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_5_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_6_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_7_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_8_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_9_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_10_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_11_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_12_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_13_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_14_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_15_sse4_2) - LABEL(unaligned_table_sse4_2) | |
.int LABEL(ashr_0_sse4_2) - LABEL(unaligned_table_sse4_2) | |
/*
 * Override ENTRY, END and libc_hidden_builtin_def before ../strcmp.S is
 * included at the bottom.  The replacement ENTRY and END ignore their
 * `name' argument and always use STRCMP_SSE2 for the symbol type, the
 * entry label and the size.
 * NOTE(review): presumably ../strcmp.S wraps its code in ENTRY/END, so
 * the generic implementation is emitted as STRCMP_SSE2; confirm against
 * that file.
 */
# undef ENTRY | |
# define ENTRY(name) \ | |
.type STRCMP_SSE2, @function; \ | |
.align 16; \ | |
STRCMP_SSE2: cfi_startproc; \ | |
CALL_MCOUNT | |
# undef END | |
# define END(name) \ | |
cfi_endproc; .size STRCMP_SSE2, .-STRCMP_SSE2 | |
# undef libc_hidden_builtin_def | |
/* It doesn't make sense to send libc-internal strcmp calls through a PLT. | |
The speedup we get from using SSE4.2 instruction is likely eaten away | |
by the indirect call in the PLT. */ | |
/* The replacement ignores `name' and makes __GI_STRCMP a global alias of
   STRCMP_SSE2. */
# define libc_hidden_builtin_def(name) \ | |
.globl __GI_STRCMP; __GI_STRCMP = STRCMP_SSE2 | |
/* Closes a conditional that is opened before this section. */
#endif | |
#include "../strcmp.S" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment