1 /* strcmp optimized with SSE4.2.
2 Copyright (C) 2017-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
21 #if ISA_SHOULD_BUILD (2)
25 # define STRCMP_ISA _sse42
26 # include "strcmp-naming.h"
28 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
29 # include "locale-defines.h"
32 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
33 /* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
34 if the new counter > the old one or is 0. */
35 # define UPDATE_STRNCMP_COUNTER \
36 /* calculate left number to compare */ \
37 lea -16(%rcx, %r11), %r9; \
39 jb LABEL(strcmp_exitz); \
41 je LABEL(strcmp_exitz); \
44 # define UPDATE_STRNCMP_COUNTER
47 # define SECTION sse4.2
49 # define LABEL(l) .L##l
53 | _SIDD_CMP_EQUAL_EACH
54 | _SIDD_NEGATIVE_POLARITY
55 | _SIDD_LEAST_SIGNIFICANT
56 on pcmpistri to find out if two 16byte data elements are the same
57 and the offset of the first different byte. There are 4 cases:
59 1. Both 16byte data elements are valid and identical.
60 2. Both 16byte data elements have EOS and are identical.
61 3. Both 16byte data elements are valid and they differ at offset X.
62 4. At least one 16byte data element has EOS at offset X. Two 16byte
63 data elements must differ at or before offset X.
65 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
67 case ECX CFlag ZFlag SFlag
73 We exit from the loop for cases 2, 3 and 4 with jbe which branches
74 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
77 /* Put all SSE 4.2 functions together. */
78 .section .text.SECTION,"ax",@progbits
80 .type STRCMP, @function
82 # ifdef USE_AS_STRCASECMP_L
84 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
85 mov %fs:(%rax),%RDX_LP
87 /* Either 1 or 5 bytes (depending on whether CET is enabled). */
90 /* FALLTHROUGH to strcasecmp_l. */
92 # ifdef USE_AS_STRNCASECMP_L
94 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
95 mov %fs:(%rax),%RCX_LP
97 /* Either 1 or 5 bytes (depending on whether CET is enabled). */
100 /* FALLTHROUGH to strncasecmp_l. */
112 * This implementation uses SSE to compare up to 16 bytes at a time.
114 # ifdef USE_AS_STRCASECMP_L
115 /* We have to fall back on the C implementation for locales
116 with encodings not matching ASCII for single bytes. */
117 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
118 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
122 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
123 jne __strcasecmp_l_nonascii
125 # ifdef USE_AS_STRNCASECMP_L
126 /* We have to fall back on the C implementation for locales
127 with encodings not matching ASCII for single bytes. */
128 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
129 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
133 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
134 jne __strncasecmp_l_nonascii
137 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
138 test %RDX_LP, %RDX_LP
139 je LABEL(strcmp_exitz)
146 /* Use 64bit AND here to avoid long NOP padding. */
147 and $0x3f, %rcx /* rsi alignment in cache line */
148 and $0x3f, %rax /* rdi alignment in cache line */
149 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
150 .section .rodata.cst16,"aM",@progbits,16
153 .quad 0x3f3f3f3f3f3f3f3f
154 .quad 0x3f3f3f3f3f3f3f3f
156 .quad 0x9999999999999999
157 .quad 0x9999999999999999
159 .quad 0x2020202020202020
160 .quad 0x2020202020202020
162 movdqa LABEL(lcase_min)(%rip), %xmm4
163 # define LCASE_MIN_reg %xmm4
164 movdqa LABEL(lcase_max)(%rip), %xmm5
165 # define LCASE_MAX_reg %xmm5
166 movdqa LABEL(case_add)(%rip), %xmm6
167 # define CASE_ADD_reg %xmm6
170 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
172 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
175 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
176 # define TOLOWER(reg1, reg2) \
177 movdqa LCASE_MIN_reg, %xmm7; \
178 movdqa LCASE_MIN_reg, %xmm8; \
181 pcmpgtb LCASE_MAX_reg, %xmm7; \
182 pcmpgtb LCASE_MAX_reg, %xmm8; \
183 pandn CASE_ADD_reg, %xmm7; \
184 pandn CASE_ADD_reg, %xmm8; \
188 TOLOWER (%xmm1, %xmm2)
190 # define TOLOWER(reg1, reg2)
192 pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
193 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
194 pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
195 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
197 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
198 jnz LABEL(less16bytes)/* If not, find different value or null char */
199 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
201 jbe LABEL(strcmp_exitz)/* finish comparison */
203 add $16, %rsi /* prepare to search next 16 bytes */
204 add $16, %rdi /* prepare to search next 16 bytes */
207 * Determine source and destination string offsets from 16-byte
208 * alignment. Use relative offset difference between the two to
209 * determine which case below to use.
213 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
214 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
215 mov $0xffff, %edx /* for equivalent offset */
217 and $0xf, %ecx /* offset of rsi */
218 and $0xf, %eax /* offset of rdi */
219 pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
221 je LABEL(ashr_0) /* rsi and rdi relative offset same */
223 mov %edx, %r8d /* r8d is offset flag for exit tail */
231 lea LABEL(unaligned_table)(%rip), %r10
232 movslq (%r10, %r9,4), %r9
233 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
234 lea (%r10, %r9), %r10
235 _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
238 * The following cases will be handled by ashr_0
239 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
240 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
246 pcmpeqb %xmm1, %xmm0 /* Any null chars? */
247 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
248 pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
251 TOLOWER (%xmm1, %xmm2)
252 pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
254 psubb %xmm0, %xmm1 /* packed sub of comparison results*/
256 shr %cl, %edx /* adjust 0xffff for offset */
257 shr %cl, %r9d /* adjust for 16-byte offset */
260 * edx must be equal to r9d if the remaining (16-rcx) bytes are equal to
261 * those starting from (16-rax) and no null char was seen.
263 jne LABEL(less32bytes) /* mismatch or null char */
264 UPDATE_STRNCMP_COUNTER
269 * Now both strings are aligned at a 16-byte boundary. Loop over strings
270 * checking 32 bytes per iteration.
272 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
275 movdqa (%rdi,%rdx), %xmm0
276 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
277 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
279 movdqa (%rsi,%rdx), %xmm1
280 TOLOWER (%xmm0, %xmm1)
281 pcmpistri $0x1a, %xmm1, %xmm0
284 jbe LABEL(ashr_0_exit_use)
285 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
287 jbe LABEL(strcmp_exitz)
290 movdqa (%rdi,%rdx), %xmm0
291 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
292 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
294 movdqa (%rsi,%rdx), %xmm1
295 TOLOWER (%xmm0, %xmm1)
296 pcmpistri $0x1a, %xmm1, %xmm0
299 jbe LABEL(ashr_0_exit_use)
300 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
302 jbe LABEL(strcmp_exitz)
304 jmp LABEL(ashr_0_use)
308 LABEL(ashr_0_exit_use):
309 jnc LABEL(strcmp_exitz)
310 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
312 jbe LABEL(strcmp_exitz)
314 lea -16(%rdx, %rcx), %rcx
315 movzbl (%rdi, %rcx), %eax
316 movzbl (%rsi, %rcx), %edx
317 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
318 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
319 movl (%rcx,%rax,4), %eax
320 movl (%rcx,%rdx,4), %edx
328 * The following cases will be handled by ashr_1
329 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
330 * n(15) n -15 0(15 +(n-15) - n) ashr_1
334 pslldq $15, %xmm2 /* shift first string to align with second */
335 TOLOWER (%xmm1, %xmm2)
336 pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
337 psubb %xmm0, %xmm2 /* packed sub of comparison results*/
339 shr %cl, %edx /* adjust 0xffff for offset */
340 shr %cl, %r9d /* adjust for 16-byte offset */
342 jnz LABEL(less32bytes) /* mismatch or null char seen */
344 UPDATE_STRNCMP_COUNTER
346 mov $16, %rcx /* index for loads*/
347 mov $1, %r9d /* byte position left over from less32bytes case */
349 * Set up %r10 so that we can detect crossing a page boundary.
350 * When %r10 goes positive we have crossed a page boundary and
351 * need to do a nibble.
354 and $0xfff, %r10 /* offset into 4K page */
355 sub $0x1000, %r10 /* subtract 4K pagesize */
356 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
359 LABEL(loop_ashr_1_use):
361 jg LABEL(nibble_ashr_1_use)
363 LABEL(nibble_ashr_1_restart_use):
364 movdqa (%rdi, %rdx), %xmm0
365 palignr $1, -16(%rdi, %rdx), %xmm0
366 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
367 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
369 movdqa (%rsi,%rdx), %xmm1
370 TOLOWER (%xmm0, %xmm1)
371 pcmpistri $0x1a, %xmm1, %xmm0
374 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
376 jbe LABEL(strcmp_exitz)
381 jg LABEL(nibble_ashr_1_use)
383 movdqa (%rdi, %rdx), %xmm0
384 palignr $1, -16(%rdi, %rdx), %xmm0
385 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
386 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
388 movdqa (%rsi,%rdx), %xmm1
389 TOLOWER (%xmm0, %xmm1)
390 pcmpistri $0x1a, %xmm1, %xmm0
393 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
395 jbe LABEL(strcmp_exitz)
398 jmp LABEL(loop_ashr_1_use)
401 LABEL(nibble_ashr_1_use):
403 movdqa -16(%rdi, %rdx), %xmm0
405 pcmpistri $0x3a,%xmm0, %xmm0
406 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
408 jae LABEL(nibble_ashr_exit_use)
411 ja LABEL(nibble_ashr_1_restart_use)
413 jmp LABEL(nibble_ashr_exit_use)
416 * The following cases will be handled by ashr_2
417 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
418 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
423 TOLOWER (%xmm1, %xmm2)
430 jnz LABEL(less32bytes)
432 UPDATE_STRNCMP_COUNTER
434 mov $16, %rcx /* index for loads */
435 mov $2, %r9d /* byte position left over from less32bytes case */
437 * Set up %r10 so that we can detect crossing a page boundary.
438 * When %r10 goes positive we have crossed a page boundary and
439 * need to do a nibble.
442 and $0xfff, %r10 /* offset into 4K page */
443 sub $0x1000, %r10 /* subtract 4K pagesize */
444 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
447 LABEL(loop_ashr_2_use):
449 jg LABEL(nibble_ashr_2_use)
451 LABEL(nibble_ashr_2_restart_use):
452 movdqa (%rdi, %rdx), %xmm0
453 palignr $2, -16(%rdi, %rdx), %xmm0
454 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
455 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
457 movdqa (%rsi,%rdx), %xmm1
458 TOLOWER (%xmm0, %xmm1)
459 pcmpistri $0x1a, %xmm1, %xmm0
462 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
464 jbe LABEL(strcmp_exitz)
469 jg LABEL(nibble_ashr_2_use)
471 movdqa (%rdi, %rdx), %xmm0
472 palignr $2, -16(%rdi, %rdx), %xmm0
473 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
474 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
476 movdqa (%rsi,%rdx), %xmm1
477 TOLOWER (%xmm0, %xmm1)
478 pcmpistri $0x1a, %xmm1, %xmm0
481 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
483 jbe LABEL(strcmp_exitz)
486 jmp LABEL(loop_ashr_2_use)
489 LABEL(nibble_ashr_2_use):
491 movdqa -16(%rdi, %rdx), %xmm0
493 pcmpistri $0x3a,%xmm0, %xmm0
494 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
496 jae LABEL(nibble_ashr_exit_use)
499 ja LABEL(nibble_ashr_2_restart_use)
501 jmp LABEL(nibble_ashr_exit_use)
504 * The following cases will be handled by ashr_3
505 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
506 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
511 TOLOWER (%xmm1, %xmm2)
518 jnz LABEL(less32bytes)
521 UPDATE_STRNCMP_COUNTER
523 mov $16, %rcx /* index for loads */
524 mov $3, %r9d /* byte position left over from less32bytes case */
526 * Set up %r10 so that we can detect crossing a page boundary.
527 * When %r10 goes positive we have crossed a page boundary and
528 * need to do a nibble.
531 and $0xfff, %r10 /* offset into 4K page */
532 sub $0x1000, %r10 /* subtract 4K pagesize */
533 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
535 LABEL(loop_ashr_3_use):
537 jg LABEL(nibble_ashr_3_use)
539 LABEL(nibble_ashr_3_restart_use):
540 movdqa (%rdi, %rdx), %xmm0
541 palignr $3, -16(%rdi, %rdx), %xmm0
542 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
543 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
545 movdqa (%rsi,%rdx), %xmm1
546 TOLOWER (%xmm0, %xmm1)
547 pcmpistri $0x1a, %xmm1, %xmm0
550 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
552 jbe LABEL(strcmp_exitz)
557 jg LABEL(nibble_ashr_3_use)
559 movdqa (%rdi, %rdx), %xmm0
560 palignr $3, -16(%rdi, %rdx), %xmm0
561 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
562 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
564 movdqa (%rsi,%rdx), %xmm1
565 TOLOWER (%xmm0, %xmm1)
566 pcmpistri $0x1a, %xmm1, %xmm0
569 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
571 jbe LABEL(strcmp_exitz)
574 jmp LABEL(loop_ashr_3_use)
577 LABEL(nibble_ashr_3_use):
579 movdqa -16(%rdi, %rdx), %xmm0
581 pcmpistri $0x3a,%xmm0, %xmm0
582 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
584 jae LABEL(nibble_ashr_exit_use)
587 ja LABEL(nibble_ashr_3_restart_use)
589 jmp LABEL(nibble_ashr_exit_use)
592 * The following cases will be handled by ashr_4
593 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
594 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
599 TOLOWER (%xmm1, %xmm2)
606 jnz LABEL(less32bytes)
609 UPDATE_STRNCMP_COUNTER
611 mov $16, %rcx /* index for loads */
612 mov $4, %r9d /* byte position left over from less32bytes case */
614 * Set up %r10 so that we can detect crossing a page boundary.
615 * When %r10 goes positive we have crossed a page boundary and
616 * need to do a nibble.
619 and $0xfff, %r10 /* offset into 4K page */
620 sub $0x1000, %r10 /* subtract 4K pagesize */
621 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
624 LABEL(loop_ashr_4_use):
626 jg LABEL(nibble_ashr_4_use)
628 LABEL(nibble_ashr_4_restart_use):
629 movdqa (%rdi, %rdx), %xmm0
630 palignr $4, -16(%rdi, %rdx), %xmm0
631 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
632 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
634 movdqa (%rsi,%rdx), %xmm1
635 TOLOWER (%xmm0, %xmm1)
636 pcmpistri $0x1a, %xmm1, %xmm0
639 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
641 jbe LABEL(strcmp_exitz)
646 jg LABEL(nibble_ashr_4_use)
648 movdqa (%rdi, %rdx), %xmm0
649 palignr $4, -16(%rdi, %rdx), %xmm0
650 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
651 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
653 movdqa (%rsi,%rdx), %xmm1
654 TOLOWER (%xmm0, %xmm1)
655 pcmpistri $0x1a, %xmm1, %xmm0
658 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
660 jbe LABEL(strcmp_exitz)
663 jmp LABEL(loop_ashr_4_use)
666 LABEL(nibble_ashr_4_use):
668 movdqa -16(%rdi, %rdx), %xmm0
670 pcmpistri $0x3a,%xmm0, %xmm0
671 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
673 jae LABEL(nibble_ashr_exit_use)
676 ja LABEL(nibble_ashr_4_restart_use)
678 jmp LABEL(nibble_ashr_exit_use)
681 * The following cases will be handled by ashr_5
682 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
683 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
688 TOLOWER (%xmm1, %xmm2)
695 jnz LABEL(less32bytes)
698 UPDATE_STRNCMP_COUNTER
700 mov $16, %rcx /* index for loads */
701 mov $5, %r9d /* byte position left over from less32bytes case */
703 * Set up %r10 so that we can detect crossing a page boundary.
704 * When %r10 goes positive we have crossed a page boundary and
705 * need to do a nibble.
708 and $0xfff, %r10 /* offset into 4K page */
709 sub $0x1000, %r10 /* subtract 4K pagesize */
710 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
713 LABEL(loop_ashr_5_use):
715 jg LABEL(nibble_ashr_5_use)
717 LABEL(nibble_ashr_5_restart_use):
718 movdqa (%rdi, %rdx), %xmm0
719 palignr $5, -16(%rdi, %rdx), %xmm0
720 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
721 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
723 movdqa (%rsi,%rdx), %xmm1
724 TOLOWER (%xmm0, %xmm1)
725 pcmpistri $0x1a, %xmm1, %xmm0
728 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
730 jbe LABEL(strcmp_exitz)
735 jg LABEL(nibble_ashr_5_use)
737 movdqa (%rdi, %rdx), %xmm0
739 palignr $5, -16(%rdi, %rdx), %xmm0
740 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
741 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
743 movdqa (%rsi,%rdx), %xmm1
744 TOLOWER (%xmm0, %xmm1)
745 pcmpistri $0x1a, %xmm1, %xmm0
748 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
750 jbe LABEL(strcmp_exitz)
753 jmp LABEL(loop_ashr_5_use)
756 LABEL(nibble_ashr_5_use):
758 movdqa -16(%rdi, %rdx), %xmm0
760 pcmpistri $0x3a,%xmm0, %xmm0
761 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
763 jae LABEL(nibble_ashr_exit_use)
766 ja LABEL(nibble_ashr_5_restart_use)
768 jmp LABEL(nibble_ashr_exit_use)
771 * The following cases will be handled by ashr_6
772 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
773 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
778 TOLOWER (%xmm1, %xmm2)
785 jnz LABEL(less32bytes)
788 UPDATE_STRNCMP_COUNTER
790 mov $16, %rcx /* index for loads */
791 mov $6, %r9d /* byte position left over from less32bytes case */
793 * Set up %r10 so that we can detect crossing a page boundary.
794 * When %r10 goes positive we have crossed a page boundary and
795 * need to do a nibble.
798 and $0xfff, %r10 /* offset into 4K page */
799 sub $0x1000, %r10 /* subtract 4K pagesize */
800 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
803 LABEL(loop_ashr_6_use):
805 jg LABEL(nibble_ashr_6_use)
807 LABEL(nibble_ashr_6_restart_use):
808 movdqa (%rdi, %rdx), %xmm0
809 palignr $6, -16(%rdi, %rdx), %xmm0
810 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
811 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
813 movdqa (%rsi,%rdx), %xmm1
814 TOLOWER (%xmm0, %xmm1)
815 pcmpistri $0x1a, %xmm1, %xmm0
818 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
820 jbe LABEL(strcmp_exitz)
825 jg LABEL(nibble_ashr_6_use)
827 movdqa (%rdi, %rdx), %xmm0
828 palignr $6, -16(%rdi, %rdx), %xmm0
829 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
830 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
832 movdqa (%rsi,%rdx), %xmm1
833 TOLOWER (%xmm0, %xmm1)
834 pcmpistri $0x1a, %xmm1, %xmm0
837 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
839 jbe LABEL(strcmp_exitz)
842 jmp LABEL(loop_ashr_6_use)
845 LABEL(nibble_ashr_6_use):
847 movdqa -16(%rdi, %rdx), %xmm0
849 pcmpistri $0x3a,%xmm0, %xmm0
850 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
852 jae LABEL(nibble_ashr_exit_use)
855 ja LABEL(nibble_ashr_6_restart_use)
857 jmp LABEL(nibble_ashr_exit_use)
860 * The following cases will be handled by ashr_7
861 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
862 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
867 TOLOWER (%xmm1, %xmm2)
874 jnz LABEL(less32bytes)
877 UPDATE_STRNCMP_COUNTER
879 mov $16, %rcx /* index for loads */
880 mov $7, %r9d /* byte position left over from less32bytes case */
882 * Set up %r10 so that we can detect crossing a page boundary.
883 * When %r10 goes positive we have crossed a page boundary and
884 * need to do a nibble.
887 and $0xfff, %r10 /* offset into 4K page */
888 sub $0x1000, %r10 /* subtract 4K pagesize */
889 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
892 LABEL(loop_ashr_7_use):
894 jg LABEL(nibble_ashr_7_use)
896 LABEL(nibble_ashr_7_restart_use):
897 movdqa (%rdi, %rdx), %xmm0
898 palignr $7, -16(%rdi, %rdx), %xmm0
899 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
900 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
902 movdqa (%rsi,%rdx), %xmm1
903 TOLOWER (%xmm0, %xmm1)
904 pcmpistri $0x1a, %xmm1, %xmm0
907 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
909 jbe LABEL(strcmp_exitz)
914 jg LABEL(nibble_ashr_7_use)
916 movdqa (%rdi, %rdx), %xmm0
917 palignr $7, -16(%rdi, %rdx), %xmm0
918 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
919 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
921 movdqa (%rsi,%rdx), %xmm1
922 TOLOWER (%xmm0, %xmm1)
923 pcmpistri $0x1a, %xmm1, %xmm0
926 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
928 jbe LABEL(strcmp_exitz)
931 jmp LABEL(loop_ashr_7_use)
934 LABEL(nibble_ashr_7_use):
936 movdqa -16(%rdi, %rdx), %xmm0
938 pcmpistri $0x3a,%xmm0, %xmm0
939 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
941 jae LABEL(nibble_ashr_exit_use)
944 ja LABEL(nibble_ashr_7_restart_use)
946 jmp LABEL(nibble_ashr_exit_use)
949 * The following cases will be handled by ashr_8
950 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
951 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
956 TOLOWER (%xmm1, %xmm2)
963 jnz LABEL(less32bytes)
966 UPDATE_STRNCMP_COUNTER
968 mov $16, %rcx /* index for loads */
969 mov $8, %r9d /* byte position left over from less32bytes case */
971 * Set up %r10 so that we can detect crossing a page boundary.
972 * When %r10 goes positive we have crossed a page boundary and
973 * need to do a nibble.
976 and $0xfff, %r10 /* offset into 4K page */
977 sub $0x1000, %r10 /* subtract 4K pagesize */
978 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
981 LABEL(loop_ashr_8_use):
983 jg LABEL(nibble_ashr_8_use)
985 LABEL(nibble_ashr_8_restart_use):
986 movdqa (%rdi, %rdx), %xmm0
987 palignr $8, -16(%rdi, %rdx), %xmm0
988 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
989 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
991 movdqa (%rsi,%rdx), %xmm1
992 TOLOWER (%xmm0, %xmm1)
993 pcmpistri $0x1a, %xmm1, %xmm0
996 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
998 jbe LABEL(strcmp_exitz)
1003 jg LABEL(nibble_ashr_8_use)
1005 movdqa (%rdi, %rdx), %xmm0
1006 palignr $8, -16(%rdi, %rdx), %xmm0
1007 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1008 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1010 movdqa (%rsi,%rdx), %xmm1
1011 TOLOWER (%xmm0, %xmm1)
1012 pcmpistri $0x1a, %xmm1, %xmm0
1015 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1017 jbe LABEL(strcmp_exitz)
1020 jmp LABEL(loop_ashr_8_use)
1023 LABEL(nibble_ashr_8_use):
1025 movdqa -16(%rdi, %rdx), %xmm0
1027 pcmpistri $0x3a,%xmm0, %xmm0
1028 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1030 jae LABEL(nibble_ashr_exit_use)
1033 ja LABEL(nibble_ashr_8_restart_use)
1035 jmp LABEL(nibble_ashr_exit_use)
1038 * The following cases will be handled by ashr_9
1039 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1040 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1045 TOLOWER (%xmm1, %xmm2)
1046 pcmpeqb %xmm1, %xmm2
1048 pmovmskb %xmm2, %r9d
1052 jnz LABEL(less32bytes)
1053 movdqa (%rdi), %xmm3
1055 UPDATE_STRNCMP_COUNTER
1057 mov $16, %rcx /* index for loads */
1058 mov $9, %r9d /* byte position left over from less32bytes case */
1060 * Set up %r10 so that we can detect crossing a page boundary.
1061 * When %r10 goes positive we have crossed a page boundary and
1062 * need to do a nibble.
1065 and $0xfff, %r10 /* offset into 4K page */
1066 sub $0x1000, %r10 /* subtract 4K pagesize */
1067 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1070 LABEL(loop_ashr_9_use):
1072 jg LABEL(nibble_ashr_9_use)
1074 LABEL(nibble_ashr_9_restart_use):
1075 movdqa (%rdi, %rdx), %xmm0
1077 palignr $9, -16(%rdi, %rdx), %xmm0
1078 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1079 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1081 movdqa (%rsi,%rdx), %xmm1
1082 TOLOWER (%xmm0, %xmm1)
1083 pcmpistri $0x1a, %xmm1, %xmm0
1086 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1088 jbe LABEL(strcmp_exitz)
1093 jg LABEL(nibble_ashr_9_use)
1095 movdqa (%rdi, %rdx), %xmm0
1096 palignr $9, -16(%rdi, %rdx), %xmm0
1097 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1098 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1100 movdqa (%rsi,%rdx), %xmm1
1101 TOLOWER (%xmm0, %xmm1)
1102 pcmpistri $0x1a, %xmm1, %xmm0
1105 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1107 jbe LABEL(strcmp_exitz)
1110 jmp LABEL(loop_ashr_9_use)
1113 LABEL(nibble_ashr_9_use):
1115 movdqa -16(%rdi, %rdx), %xmm0
1117 pcmpistri $0x3a,%xmm0, %xmm0
1118 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1120 jae LABEL(nibble_ashr_exit_use)
1123 ja LABEL(nibble_ashr_9_restart_use)
1125 jmp LABEL(nibble_ashr_exit_use)
1128 * The following cases will be handled by ashr_10
1129 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1130 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1135 TOLOWER (%xmm1, %xmm2)
1136 pcmpeqb %xmm1, %xmm2
1138 pmovmskb %xmm2, %r9d
1142 jnz LABEL(less32bytes)
1143 movdqa (%rdi), %xmm3
1145 UPDATE_STRNCMP_COUNTER
1147 mov $16, %rcx /* index for loads */
1148 mov $10, %r9d /* byte position left over from less32bytes case */
1150 * Set up %r10 so that we can detect crossing a page boundary.
1151 * When %r10 goes positive we have crossed a page boundary and
1152 * need to do a nibble.
1155 and $0xfff, %r10 /* offset into 4K page */
1156 sub $0x1000, %r10 /* subtract 4K pagesize */
1157 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1160 LABEL(loop_ashr_10_use):
1162 jg LABEL(nibble_ashr_10_use)
1164 LABEL(nibble_ashr_10_restart_use):
1165 movdqa (%rdi, %rdx), %xmm0
1166 palignr $10, -16(%rdi, %rdx), %xmm0
1167 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1168 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1170 movdqa (%rsi,%rdx), %xmm1
1171 TOLOWER (%xmm0, %xmm1)
1172 pcmpistri $0x1a, %xmm1, %xmm0
1175 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1177 jbe LABEL(strcmp_exitz)
1182 jg LABEL(nibble_ashr_10_use)
1184 movdqa (%rdi, %rdx), %xmm0
1185 palignr $10, -16(%rdi, %rdx), %xmm0
1186 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1187 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1189 movdqa (%rsi,%rdx), %xmm1
1190 TOLOWER (%xmm0, %xmm1)
1191 pcmpistri $0x1a, %xmm1, %xmm0
1194 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1196 jbe LABEL(strcmp_exitz)
1199 jmp LABEL(loop_ashr_10_use)
1202 LABEL(nibble_ashr_10_use):
1204 movdqa -16(%rdi, %rdx), %xmm0
1206 pcmpistri $0x3a,%xmm0, %xmm0
1207 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1209 jae LABEL(nibble_ashr_exit_use)
1212 ja LABEL(nibble_ashr_10_restart_use)
1214 jmp LABEL(nibble_ashr_exit_use)
1217 * The following cases will be handled by ashr_11
1218 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1219 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1224 TOLOWER (%xmm1, %xmm2)
1225 pcmpeqb %xmm1, %xmm2
1227 pmovmskb %xmm2, %r9d
1231 jnz LABEL(less32bytes)
1232 movdqa (%rdi), %xmm3
1234 UPDATE_STRNCMP_COUNTER
1236 mov $16, %rcx /* index for loads */
1237 mov $11, %r9d /* byte position left over from less32bytes case */
1239 * Set up %r10 so that we can detect crossing a page boundary.
1240 * When %r10 goes positive we have crossed a page boundary and
1241 * need to do a nibble.
1244 and $0xfff, %r10 /* offset into 4K page */
1245 sub $0x1000, %r10 /* subtract 4K pagesize */
1246 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1249 LABEL(loop_ashr_11_use):
1251 jg LABEL(nibble_ashr_11_use)
1253 LABEL(nibble_ashr_11_restart_use):
1254 movdqa (%rdi, %rdx), %xmm0
1255 palignr $11, -16(%rdi, %rdx), %xmm0
1256 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1257 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1259 movdqa (%rsi,%rdx), %xmm1
1260 TOLOWER (%xmm0, %xmm1)
1261 pcmpistri $0x1a, %xmm1, %xmm0
1264 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1266 jbe LABEL(strcmp_exitz)
1271 jg LABEL(nibble_ashr_11_use)
1273 movdqa (%rdi, %rdx), %xmm0
1274 palignr $11, -16(%rdi, %rdx), %xmm0
1275 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1276 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1278 movdqa (%rsi,%rdx), %xmm1
1279 TOLOWER (%xmm0, %xmm1)
1280 pcmpistri $0x1a, %xmm1, %xmm0
1283 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1285 jbe LABEL(strcmp_exitz)
1288 jmp LABEL(loop_ashr_11_use)
1291 LABEL(nibble_ashr_11_use):
1293 movdqa -16(%rdi, %rdx), %xmm0
1295 pcmpistri $0x3a,%xmm0, %xmm0
1296 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1298 jae LABEL(nibble_ashr_exit_use)
1301 ja LABEL(nibble_ashr_11_restart_use)
1303 jmp LABEL(nibble_ashr_exit_use)
1306 * The following cases will be handled by ashr_12
1307 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1308 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1313 TOLOWER (%xmm1, %xmm2)
1314 pcmpeqb %xmm1, %xmm2
1316 pmovmskb %xmm2, %r9d
1320 jnz LABEL(less32bytes)
1321 movdqa (%rdi), %xmm3
1323 UPDATE_STRNCMP_COUNTER
1325 mov $16, %rcx /* index for loads */
1326 mov $12, %r9d /* byte position left over from less32bytes case */
1328 * Set up %r10 so that we can detect crossing a page boundary.
1329 * When %r10 goes positive we have crossed a page boundary and
1330 * need to do a nibble.
1333 and $0xfff, %r10 /* offset into 4K page */
1334 sub $0x1000, %r10 /* subtract 4K pagesize */
1335 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1338 LABEL(loop_ashr_12_use):
1340 jg LABEL(nibble_ashr_12_use)
1342 LABEL(nibble_ashr_12_restart_use):
1343 movdqa (%rdi, %rdx), %xmm0
1344 palignr $12, -16(%rdi, %rdx), %xmm0
1345 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1346 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1348 movdqa (%rsi,%rdx), %xmm1
1349 TOLOWER (%xmm0, %xmm1)
1350 pcmpistri $0x1a, %xmm1, %xmm0
1353 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1355 jbe LABEL(strcmp_exitz)
1360 jg LABEL(nibble_ashr_12_use)
1362 movdqa (%rdi, %rdx), %xmm0
1363 palignr $12, -16(%rdi, %rdx), %xmm0
1364 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1365 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1367 movdqa (%rsi,%rdx), %xmm1
1368 TOLOWER (%xmm0, %xmm1)
1369 pcmpistri $0x1a, %xmm1, %xmm0
1372 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1374 jbe LABEL(strcmp_exitz)
1377 jmp LABEL(loop_ashr_12_use)
1380 LABEL(nibble_ashr_12_use):
1382 movdqa -16(%rdi, %rdx), %xmm0
1384 pcmpistri $0x3a,%xmm0, %xmm0
1385 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1387 jae LABEL(nibble_ashr_exit_use)
1390 ja LABEL(nibble_ashr_12_restart_use)
1392 jmp LABEL(nibble_ashr_exit_use)
1395 * The following cases will be handled by ashr_13
1396 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1397 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1402 TOLOWER (%xmm1, %xmm2)
1403 pcmpeqb %xmm1, %xmm2
1405 pmovmskb %xmm2, %r9d
1409 jnz LABEL(less32bytes)
1410 movdqa (%rdi), %xmm3
1412 UPDATE_STRNCMP_COUNTER
1414 mov $16, %rcx /* index for loads */
1415 mov $13, %r9d /* byte position left over from less32bytes case */
1417 * Set up %r10 so that we can detect crossing a page boundary.
1418 * When %r10 goes positive we have crossed a page boundary and
1419 * need to do a nibble.
1422 and $0xfff, %r10 /* offset into 4K page */
1423 sub $0x1000, %r10 /* subtract 4K pagesize */
1425 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the shift-by-13 case.  Each step rebuilds 16 bytes
   of the first string from two adjacent aligned blocks and compares them
   with the 16 bytes of the second string at the same index (%rdx).  The
   visible body contains two copies of the step before jumping back to the
   loop head.
   NOTE(review): no flag-setting instruction is visible directly before
   either `jg` below; presumably the page-boundary tracker %r10 is advanced
   there -- confirm.  */
1428 LABEL(loop_ashr_13_use):
1430 jg LABEL(nibble_ashr_13_use)
/* Re-entry point used by nibble_ashr_13_use when the loop may continue.  */
1432 LABEL(nibble_ashr_13_restart_use):
/* xmm0 = current aligned 16-byte block of the first string.  */
1433 movdqa (%rdi, %rdx), %xmm0
/* Concatenate xmm0 (high) with the previous block (low) and shift right by
   13 bytes: xmm0 = last 3 bytes of the previous block followed by the
   first 13 bytes of the current one.  */
1434 palignr $13, -16(%rdi, %rdx), %xmm0
/* Case-sensitive builds compare directly against memory.  imm8 0x1a is the
   equal-each / negative-polarity / least-significant mode described in the
   table at the top of the file: %ecx receives the offset of the first
   differing byte.  */
1435 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1436 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* NOTE(review): no `# else` / `# endif` is visible for the conditional
   above; the next three lines look like the case-insensitive alternative
   (load, TOLOWER, register-to-register compare) -- confirm.  */
1438 movdqa (%rsi,%rdx), %xmm1
1439 TOLOWER (%xmm0, %xmm1)
1440 pcmpistri $0x1a, %xmm1, %xmm0
/* Length-limited variants only: leave through strcmp_exitz.
   NOTE(review): the instruction that sets the flags for this `jbe` is not
   visible; the head of the file names %r11 as the counter -- confirm.  */
1443 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1445 jbe LABEL(strcmp_exitz)
/* Second copy of the step above.  */
1450 jg LABEL(nibble_ashr_13_use)
1452 movdqa (%rdi, %rdx), %xmm0
1453 palignr $13, -16(%rdi, %rdx), %xmm0
1454 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1455 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1457 movdqa (%rsi,%rdx), %xmm1
1458 TOLOWER (%xmm0, %xmm1)
1459 pcmpistri $0x1a, %xmm1, %xmm0
1462 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1464 jbe LABEL(strcmp_exitz)
/* Nothing decided yet: go round again.  */
1467 jmp LABEL(loop_ashr_13_use)
/* Reached from loop_ashr_13_use when its `jg` page-boundary check fires.
   Inspects the previous aligned block of the first string and then either
   resumes the loop or leaves through the common nibble exit.  */
1470 LABEL(nibble_ashr_13_use):
/* xmm0 = the aligned 16-byte block just before index %rdx.  */
1472 movdqa -16(%rdi, %rdx), %xmm0
/* pcmpistri with imm8 0x3a compares xmm0 against itself; presumably this
   locates the terminating NUL (index returned in %ecx) -- confirm against
   the instruction reference.  */
1474 pcmpistri $0x3a,%xmm0, %xmm0
/* NOTE(review): the compares that feed the `jae` and `ja` below are not
   visible here.  */
1475 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1477 jae LABEL(nibble_ashr_exit_use)
/* When taken, the main loop resumes at its restart label.  */
1480 ja LABEL(nibble_ashr_13_restart_use)
/* Otherwise finish through the shared exit.  */
1482 jmp LABEL(nibble_ashr_exit_use)
1485 * The following cases will be handled by ashr_14:
1486 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
1487 *   n (2~15)            | n - 2               | 13 (15 + (n - 2) - n) | ashr_14
/* Entry sequence for the shift-by-14 case.  The `LABEL(ashr_14)` line is
   not visible here; the name comes from the preceding case comment and
   from unaligned_table.  It checks the first block, then initialises the
   index and page-boundary tracker used by loop_ashr_14_use.  */
/* TOLOWER is defined outside this view; presumably it lower-cases both
   operands in the case-insensitive builds -- confirm.  */
1492 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality: each byte of xmm2 becomes 0xff where it equals the
   corresponding byte of xmm1, 0x00 otherwise.  */
1493 pcmpeqb %xmm1, %xmm2
/* Gather the top bit of every byte of xmm2 into the low 16 bits of %r9d.  */
1495 pmovmskb %xmm2, %r9d
/* NOTE(review): the instructions that produce the flags for this branch
   are not visible here.  */
1499 jnz LABEL(less32bytes)
/* xmm3 = the aligned 16 bytes at %rdi.  */
1500 movdqa (%rdi), %xmm3
/* Per the macro at the top of the file: in the length-limited builds this
   computes the number of bytes left to compare and branches to
   strcmp_exitz when none remain; otherwise it expands to nothing.  */
1502 UPDATE_STRNCMP_COUNTER
1504 mov $16, %rcx /* index for loads */
1505 mov $14, %r9d /* byte position left over from less32bytes case */
1507 * Setup %r10 value allows us to detect crossing a page boundary.
1508 * When %r10 goes positive we have crossed a page boundary and
1509 * need to do a nibble.
/* NOTE(review): the instruction that first loads %r10 is not visible.  The
   two lines below reduce it to (offset within the 4K page) - 4096, which
   is always negative at this point.  */
1512 and $0xfff, %r10 /* offset into 4K page */
1513 sub $0x1000, %r10 /* subtract 4K pagesize */
1515 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the shift-by-14 case.  Each step rebuilds 16 bytes
   of the first string from two adjacent aligned blocks and compares them
   with the 16 bytes of the second string at the same index (%rdx).  The
   visible body contains two copies of the step before jumping back to the
   loop head.
   NOTE(review): no flag-setting instruction is visible directly before
   either `jg` below; presumably the page-boundary tracker %r10 is advanced
   there -- confirm.  */
1518 LABEL(loop_ashr_14_use):
1520 jg LABEL(nibble_ashr_14_use)
/* Re-entry point used by nibble_ashr_14_use when the loop may continue.  */
1522 LABEL(nibble_ashr_14_restart_use):
/* xmm0 = current aligned 16-byte block of the first string.  */
1523 movdqa (%rdi, %rdx), %xmm0
/* Concatenate xmm0 (high) with the previous block (low) and shift right by
   14 bytes: xmm0 = last 2 bytes of the previous block followed by the
   first 14 bytes of the current one.  */
1524 palignr $14, -16(%rdi, %rdx), %xmm0
/* Case-sensitive builds compare directly against memory.  imm8 0x1a is the
   equal-each / negative-polarity / least-significant mode described in the
   table at the top of the file: %ecx receives the offset of the first
   differing byte.  */
1525 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1526 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* NOTE(review): no `# else` / `# endif` is visible for the conditional
   above; the next three lines look like the case-insensitive alternative
   (load, TOLOWER, register-to-register compare) -- confirm.  */
1528 movdqa (%rsi,%rdx), %xmm1
1529 TOLOWER (%xmm0, %xmm1)
1530 pcmpistri $0x1a, %xmm1, %xmm0
/* Length-limited variants only: leave through strcmp_exitz.
   NOTE(review): the instruction that sets the flags for this `jbe` is not
   visible; the head of the file names %r11 as the counter -- confirm.  */
1533 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1535 jbe LABEL(strcmp_exitz)
/* Second copy of the step above.  */
1540 jg LABEL(nibble_ashr_14_use)
1542 movdqa (%rdi, %rdx), %xmm0
1543 palignr $14, -16(%rdi, %rdx), %xmm0
1544 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1545 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1547 movdqa (%rsi,%rdx), %xmm1
1548 TOLOWER (%xmm0, %xmm1)
1549 pcmpistri $0x1a, %xmm1, %xmm0
1552 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1554 jbe LABEL(strcmp_exitz)
/* Nothing decided yet: go round again.  */
1557 jmp LABEL(loop_ashr_14_use)
/* Reached from loop_ashr_14_use when its `jg` page-boundary check fires.
   Inspects the previous aligned block of the first string and then either
   resumes the loop or leaves through the common nibble exit.  */
1560 LABEL(nibble_ashr_14_use):
/* xmm0 = the aligned 16-byte block just before index %rdx.  */
1562 movdqa -16(%rdi, %rdx), %xmm0
/* pcmpistri with imm8 0x3a compares xmm0 against itself; presumably this
   locates the terminating NUL (index returned in %ecx) -- confirm against
   the instruction reference.  */
1564 pcmpistri $0x3a,%xmm0, %xmm0
/* NOTE(review): the compares that feed the `jae` and `ja` below are not
   visible here.  */
1565 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1567 jae LABEL(nibble_ashr_exit_use)
/* When taken, the main loop resumes at its restart label.  */
1570 ja LABEL(nibble_ashr_14_restart_use)
/* Otherwise finish through the shared exit.  */
1572 jmp LABEL(nibble_ashr_exit_use)
1575 * The following cases will be handled by ashr_15:
1576 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
1577 *   n (1~15)            | n - 1               | 14 (15 + (n - 1) - n) | ashr_15
/* Entry sequence for the shift-by-15 case.  The `LABEL(ashr_15)` line is
   not visible here; the name comes from the preceding case comment and
   from unaligned_table.  It checks the first block, then initialises the
   index and page-boundary tracker used by loop_ashr_15_use.  */
/* TOLOWER is defined outside this view; presumably it lower-cases both
   operands in the case-insensitive builds -- confirm.  */
1582 TOLOWER (%xmm1, %xmm2)
/* Byte-wise equality: each byte of xmm2 becomes 0xff where it equals the
   corresponding byte of xmm1, 0x00 otherwise.  */
1583 pcmpeqb %xmm1, %xmm2
/* Gather the top bit of every byte of xmm2 into the low 16 bits of %r9d.  */
1585 pmovmskb %xmm2, %r9d
/* NOTE(review): the instructions that produce the flags for this branch
   are not visible here.  */
1589 jnz LABEL(less32bytes)
/* xmm3 = the aligned 16 bytes at %rdi.  */
1591 movdqa (%rdi), %xmm3
/* Per the macro at the top of the file: in the length-limited builds this
   computes the number of bytes left to compare and branches to
   strcmp_exitz when none remain; otherwise it expands to nothing.  */
1593 UPDATE_STRNCMP_COUNTER
1595 mov $16, %rcx /* index for loads */
1596 mov $15, %r9d /* byte position left over from less32bytes case */
1598 * Setup %r10 value allows us to detect crossing a page boundary.
1599 * When %r10 goes positive we have crossed a page boundary and
1600 * need to do a nibble.
/* NOTE(review): the instruction that first loads %r10 is not visible.  The
   two lines below reduce it to (offset within the 4K page) - 4096, which
   is always negative at this point.  */
1603 and $0xfff, %r10 /* offset into 4K page */
1605 sub $0x1000, %r10 /* subtract 4K pagesize */
1607 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
/* Main compare loop for the shift-by-15 case.  Each step rebuilds 16 bytes
   of the first string from two adjacent aligned blocks and compares them
   with the 16 bytes of the second string at the same index (%rdx).  The
   visible body contains two copies of the step before jumping back to the
   loop head.
   NOTE(review): no flag-setting instruction is visible directly before
   either `jg` below; presumably the page-boundary tracker %r10 is advanced
   there -- confirm.  */
1610 LABEL(loop_ashr_15_use):
1612 jg LABEL(nibble_ashr_15_use)
/* Re-entry point used by nibble_ashr_15_use when the loop may continue.  */
1614 LABEL(nibble_ashr_15_restart_use):
/* xmm0 = current aligned 16-byte block of the first string.  */
1615 movdqa (%rdi, %rdx), %xmm0
/* Concatenate xmm0 (high) with the previous block (low) and shift right by
   15 bytes: xmm0 = last byte of the previous block followed by the first
   15 bytes of the current one.  */
1616 palignr $15, -16(%rdi, %rdx), %xmm0
/* Case-sensitive builds compare directly against memory.  imm8 0x1a is the
   equal-each / negative-polarity / least-significant mode described in the
   table at the top of the file: %ecx receives the offset of the first
   differing byte.  */
1617 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1618 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
/* NOTE(review): no `# else` / `# endif` is visible for the conditional
   above; the next three lines look like the case-insensitive alternative
   (load, TOLOWER, register-to-register compare) -- confirm.  */
1620 movdqa (%rsi,%rdx), %xmm1
1621 TOLOWER (%xmm0, %xmm1)
1622 pcmpistri $0x1a, %xmm1, %xmm0
/* Length-limited variants only: leave through strcmp_exitz.
   NOTE(review): the instruction that sets the flags for this `jbe` is not
   visible; the head of the file names %r11 as the counter -- confirm.  */
1625 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1627 jbe LABEL(strcmp_exitz)
/* Second copy of the step above.  */
1632 jg LABEL(nibble_ashr_15_use)
1634 movdqa (%rdi, %rdx), %xmm0
1635 palignr $15, -16(%rdi, %rdx), %xmm0
1636 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1637 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1639 movdqa (%rsi,%rdx), %xmm1
1640 TOLOWER (%xmm0, %xmm1)
1641 pcmpistri $0x1a, %xmm1, %xmm0
1644 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1646 jbe LABEL(strcmp_exitz)
/* Nothing decided yet: go round again.  */
1649 jmp LABEL(loop_ashr_15_use)
/* Reached from loop_ashr_15_use when its `jg` page-boundary check fires.
   Inspects the previous aligned block of the first string and then either
   resumes the loop or continues into the common nibble exit.  Unlike the
   other nibble blocks, no trailing `jmp` is visible: the code that follows
   is nibble_ashr_exit_use itself, so control falls through to it.  */
1652 LABEL(nibble_ashr_15_use):
/* xmm0 = the aligned 16-byte block just before index %rdx.  */
1654 movdqa -16(%rdi, %rdx), %xmm0
/* pcmpistri with imm8 0x3a compares xmm0 against itself; presumably this
   locates the terminating NUL (index returned in %ecx) -- confirm against
   the instruction reference.  */
1656 pcmpistri $0x3a,%xmm0, %xmm0
/* NOTE(review): the compares that feed the `jae` and `ja` below are not
   visible here.  */
1657 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1659 jae LABEL(nibble_ashr_exit_use)
/* When taken, the main loop resumes at its restart label.  */
1662 ja LABEL(nibble_ashr_15_restart_use)
/* Shared exit for the nibble_ashr_N_use blocks: run one more 16-byte
   compare at index %rdx, then load the two bytes whose difference decides
   the result.
   NOTE(review): the final subtraction and `ret` are not visible here, and
   neither are any labels between this one and the byte loads -- confirm.  */
1664 LABEL(nibble_ashr_exit_use):
/* Case-sensitive builds compare xmm0 directly against the second string.  */
1665 # if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1666 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
/* NOTE(review): no `# else` / `# endif` is visible for the conditional
   above; the next three lines look like the case-insensitive alternative
   (load, TOLOWER, register-to-register compare) -- confirm.  */
1668 movdqa (%rsi,%rdx), %xmm1
1669 TOLOWER (%xmm0, %xmm1)
1670 pcmpistri $0x1a, %xmm1, %xmm0
/* CF clear after pcmpistri means no differing byte was found in this
   block, so leave through strcmp_exitz.  */
1674 jnc LABEL(strcmp_exitz)
/* Length-limited variants only.
   NOTE(review): the instruction that sets the flags for this `jbe` is not
   visible here.  */
1675 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1677 jbe LABEL(strcmp_exitz)
/* %rdi = %rdi + %r9 - 16, where %r9 holds the per-case byte position that
   the ashr_N entry sequences load.  */
1680 lea -16(%rdi, %r9), %rdi
/* Zero-extended byte from each string at index %rdx.  Note that %edx is
   overwritten by the second load, after it has served as the index.  */
1681 movzbl (%rdi, %rdx), %eax
1682 movzbl (%rsi, %rdx), %edx
/* Case-insensitive builds: replace both byte values by their 32-bit entry
   in the _nl_C_LC_CTYPE_tolower table (4-byte entries, base biased by
   128 entries).  */
1687 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1688 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1689 movl (%rcx,%rdx,4), %edx
1690 movl (%rcx,%rax,4), %eax
/* Tail that resolves a difference found in the first blocks.  The label
   lines are not visible here; the name less32bytes is taken from the
   `jnz LABEL(less32bytes)` branches in the ashr_N entry sequences --
   confirm which labels actually sit on these instructions.  */
1697 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1698 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
/* NOTE(review): the test of %r8d that decides whether this swap runs is
   not visible here.  */
1701 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1706 bsf %rdx, %rdx /* find and store bit index in %rdx */
/* Length-limited variants only.
   NOTE(review): the instruction that sets the flags for this `jbe` is not
   visible here.  */
1708 # if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1710 jbe LABEL(strcmp_exitz)
/* Zero-extended byte from each string at the bit index found above.  */
1712 movzbl (%rsi, %rdx), %ecx
1713 movzbl (%rdi, %rdx), %eax
/* Case-insensitive builds: replace both byte values by their 32-bit entry
   in the _nl_C_LC_CTYPE_tolower table (4-byte entries, base biased by
   128 entries).  %rdx is reused as the table base here.
   NOTE(review): the final subtraction and `ret` are not visible.  */
1715 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1716 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1717 movl (%rdx,%rcx,4), %ecx
1718 movl (%rdx,%rax,4), %eax
/* strcmp_exitz is the shared target of the `jbe` branches taken by the
   length-limited variants (the head of the file says they branch here when
   the %r11 counter is used up) and of the `jnc` after the final pcmpistri.
   NOTE(review): no instruction is visible directly under this label --
   confirm what it returns.  */
1724 LABEL(strcmp_exitz):
/* NOTE(review): the label and the byte loads that belong to the code below
   are not visible; per the existing remark it duplicates an earlier
   sequence.  */
1729 // XXX Same as code above
/* Case-insensitive builds: replace the byte values in %rcx and %rax by
   their 32-bit entry in the _nl_C_LC_CTYPE_tolower table (4-byte entries,
   base biased by 128 entries).  */
1734 # if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1735 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1736 movl (%rdx,%rcx,4), %ecx
1737 movl (%rdx,%rax,4), %eax
1743 .size STRCMP, .-STRCMP
1750 /* Put all SSE 4.2 functions together. */
/* Read-only dispatch table, placed in its own .rodata subsection.  Each
   entry is a 32-bit offset from the start of the table to one of the
   ashr_N handlers, which keeps the table position-independent.
   Entry order: index 0..14 hold ashr_1..ashr_15 and index 15 holds ashr_0.
   NOTE(review): the code that indexes this table is not visible here --
   confirm how the index is derived before reordering entries.  */
1751 .section .rodata.SECTION,"a",@progbits
1753 LABEL(unaligned_table):
1754 .int LABEL(ashr_1) - LABEL(unaligned_table)
1755 .int LABEL(ashr_2) - LABEL(unaligned_table)
1756 .int LABEL(ashr_3) - LABEL(unaligned_table)
1757 .int LABEL(ashr_4) - LABEL(unaligned_table)
1758 .int LABEL(ashr_5) - LABEL(unaligned_table)
1759 .int LABEL(ashr_6) - LABEL(unaligned_table)
1760 .int LABEL(ashr_7) - LABEL(unaligned_table)
1761 .int LABEL(ashr_8) - LABEL(unaligned_table)
1762 .int LABEL(ashr_9) - LABEL(unaligned_table)
1763 .int LABEL(ashr_10) - LABEL(unaligned_table)
1764 .int LABEL(ashr_11) - LABEL(unaligned_table)
1765 .int LABEL(ashr_12) - LABEL(unaligned_table)
1766 .int LABEL(ashr_13) - LABEL(unaligned_table)
1767 .int LABEL(ashr_14) - LABEL(unaligned_table)
1768 .int LABEL(ashr_15) - LABEL(unaligned_table)
/* Last slot: the zero-shift handler.  */
1769 .int LABEL(ashr_0) - LABEL(unaligned_table)