1 /* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <isa-level.h>
20
21 #if ISA_SHOULD_BUILD (4)
22
23 # define STRCMP_ISA _evex
24 # include "strcmp-naming.h"
25
26 # include <sysdep.h>
27 # if defined USE_AS_STRCASECMP_L
28 # include "locale-defines.h"
29 # endif
30
31 # ifndef STRCMP
32 # define STRCMP __strcmp_evex
33 # endif
34
35 # define PAGE_SIZE 4096
36
37 /* VEC_SIZE = Number of bytes in a ymm register. */
38 # define VEC_SIZE 32
39 # define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
40
41 # define VMOVU vmovdqu64
42 # define VMOVA vmovdqa64
43
44 # ifdef USE_AS_WCSCMP
45 # define TESTEQ subl $0xff,
46 /* Compare packed dwords. */
47 # define VPCMP vpcmpd
48 # define VPMINU vpminud
49 # define VPTESTM vptestmd
50 # define VPTESTNM vptestnmd
51 /* 1 dword char == 4 bytes. */
52 # define SIZE_OF_CHAR 4
53 # else
54 # define TESTEQ incl
55 /* Compare packed bytes. */
56 # define VPCMP vpcmpb
57 # define VPMINU vpminub
58 # define VPTESTM vptestmb
59 # define VPTESTNM vptestnmb
60 /* 1 byte char == 1 byte. */
61 # define SIZE_OF_CHAR 1
62 # endif
63
64 # ifdef USE_AS_STRNCMP
65 # define LOOP_REG r9d
66 # define LOOP_REG64 r9
67
68 # define OFFSET_REG8 r9b
69 # define OFFSET_REG r9d
70 # define OFFSET_REG64 r9
71 # else
72 # define LOOP_REG edx
73 # define LOOP_REG64 rdx
74
75 # define OFFSET_REG8 dl
76 # define OFFSET_REG edx
77 # define OFFSET_REG64 rdx
78 # endif
79
80 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
81 # define VEC_OFFSET 0
82 # else
83 # define VEC_OFFSET (-VEC_SIZE)
84 # endif
85
86 # define XMM0 xmm17
87 # define XMM1 xmm18
88
89 # define XMM10 xmm27
90 # define XMM11 xmm28
91 # define XMM12 xmm29
92 # define XMM13 xmm30
93 # define XMM14 xmm31
94
95
96 # define YMM0 ymm17
97 # define YMM1 ymm18
98 # define YMM2 ymm19
99 # define YMM3 ymm20
100 # define YMM4 ymm21
101 # define YMM5 ymm22
102 # define YMM6 ymm23
103 # define YMM7 ymm24
104 # define YMM8 ymm25
105 # define YMM9 ymm26
106 # define YMM10 ymm27
107 # define YMM11 ymm28
108 # define YMM12 ymm29
109 # define YMM13 ymm30
110 # define YMM14 ymm31
111
112 # ifdef USE_AS_STRCASECMP_L
113 # define BYTE_LOOP_REG OFFSET_REG
114 # else
115 # define BYTE_LOOP_REG ecx
116 # endif
117
118 # ifdef USE_AS_STRCASECMP_L
119 # ifdef USE_AS_STRNCMP
120 # define LOCALE_REG rcx
121 # define LOCALE_REG_LP RCX_LP
122 # else
123 # define LOCALE_REG rdx
124 # define LOCALE_REG_LP RDX_LP
125 # endif
126 # endif
127
128 # define LCASE_MIN_YMM %YMM12
129 # define LCASE_MAX_YMM %YMM13
130 # define CASE_ADD_YMM %YMM14
131
132 # define LCASE_MIN_XMM %XMM12
133 # define LCASE_MAX_XMM %XMM13
134 # define CASE_ADD_XMM %XMM14
135
136 /* NB: wcsncmp uses r11 but strcasecmp is never used in
137 conjunction with wcscmp. */
138 # define TOLOWER_BASE %r11
139
140 # ifdef USE_AS_STRCASECMP_L
141 # define _REG(x, y) x ## y
142 # define REG(x, y) _REG(x, y)
143 # define TOLOWER(reg1, reg2, ext) \
144 vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \
145 vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \
146 vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \
147 vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \
148 vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \
149 vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6}
150
151 # define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
152 # define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM)
153 # define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM)
154
155 # define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \
156 TOLOWER (s1_reg, s2_reg, ext); \
157 VPCMP $0, s1_reg, s2_reg, reg_out
158
159 # define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \
160 VMOVU s2_mem, s2_reg; \
161 CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
162
163 # define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
164 # define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
165
166 # define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
167 # define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
168
169 # else
170 # define TOLOWER_gpr(...)
171 # define TOLOWER_YMM(...)
172 # define TOLOWER_XMM(...)
173
174 # define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \
175 VPCMP $0, s2_reg, s1_reg, reg_out
176
177 # define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
178
179 # define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \
180 VPCMP $0, s2_mem, s1_reg, reg_out
181
182 # define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
183 # endif
184
185 /* Warning!
186 wcscmp/wcsncmp have to use SIGNED comparison for elements.
187 strcmp/strncmp have to use UNSIGNED comparison for elements.
188 */
189
190 /* The main idea of the string comparison (byte or dword) using 256-bit
191    EVEX instructions consists of comparing (VPCMP) two ymm vectors.  The
192    comparison can be on either packed bytes or dwords depending on
193    USE_AS_WCSCMP.  In order to check the null CHAR, the algorithm keeps
194    track of the matched bytes/dwords, requiring 5 EVEX instructions (3
195    VPCMP and 2 KORD).  In general, the cost of comparing VEC_SIZE bytes
196    (32 bytes) is 3 VPCMP and 2 KORD instructions, together with VMOVU
197    and ktestd instructions.  The main loop (away from a page boundary)
198    compares 4 vectors at a time, effectively comparing 4 x VEC_SIZE
199    bytes (128 bytes) on each iteration.
200 
201    The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
202    same as strcmp, except that a maximum offset is tracked.  If the
203    maximum offset is reached before a difference is found, zero is
204    returned.  */
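
/* Illustrative sketch (not part of the build): the core of a single
   32-byte comparison step described above, written with AVX-512
   intrinsics in C for the byte-string case.  The function name and the
   standalone form are assumptions for illustration; it mirrors the
   VPTESTM + masked VPCMP + TESTEQ/tzcnt sequence used throughout this
   file (compile with -mavx512vl -mavx512bw -mbmi).

   #include <immintrin.h>
   #include <stdint.h>

   // Returns -1 if the 32 bytes at s1/s2 are equal and contain no null
   // byte, otherwise the index of the first mismatch or null byte.
   static inline int
   cmp_one_vec (const char *s1, const char *s2)
   {
     __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
     __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
     // VPTESTM: bit set in k2 where the s1 byte is non-null.
     __mmask32 k2 = _mm256_test_epi8_mask (v1, v1);
     // Masked VPCMP: bit set in k1 where the bytes are equal AND the
     // s1 byte is non-null, so a cleared bit is a mismatch or null.
     uint32_t k1 = _mm256_mask_cmpeq_epi8_mask (k2, v1, v2);
     // TESTEQ (`incl`): all-ones overflows to zero; otherwise the
     // trailing ones carry up to the first cleared bit, whose index
     // tzcnt then recovers.
     if (++k1 == 0)
       return -1;
     return (int) _tzcnt_u32 (k1);
   }  */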
205
206 .section .text.evex, "ax", @progbits
207 .align 16
208 .type STRCMP, @function
209 .globl STRCMP
210 # ifdef USE_AS_STRCASECMP_L
211 ENTRY (STRCASECMP)
212 movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
213 mov %fs:(%rax), %LOCALE_REG_LP
214
215 /* Either 1 or 5 bytes (depending on whether CET is enabled).  */
216 .p2align 4
217 END (STRCASECMP)
218 /* FALLTHROUGH to strcasecmp/strncasecmp_l. */
219 # endif
220
221 .p2align 4
222 STRCMP:
223 cfi_startproc
224 _CET_ENDBR
225 CALL_MCOUNT
226
227 # if defined USE_AS_STRCASECMP_L
228 /* We have to fall back on the C implementation for locales with
229 encodings not matching ASCII for single bytes. */
230 # if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
231 mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
232 # else
233 mov (%LOCALE_REG), %RAX_LP
234 # endif
235 testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
236 jne STRCASECMP_L_NONASCII
237 leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
238 # endif
239
240 # ifdef USE_AS_STRNCMP
241 /* Don't overwrite LOCALE_REG (rcx) until we have passed
242    L(one_or_less).  Otherwise we might use the wrong locale in
243    the OVERFLOW_STRCMP (strcasecmp_l).  */
244 # ifdef __ILP32__
245 /* Clear the upper 32 bits. */
246 movl %edx, %edx
247 # endif
248 cmp $1, %RDX_LP
249 /* Signed comparison intentional. We use this branch to also
250 test cases where length >= 2^63. These very large sizes can be
251 handled with strcmp as there is no way for that length to
252 actually bound the buffer. */
253 jle L(one_or_less)
254 # endif
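
/* Illustrative sketch (not part of the build): why a signed compare of
   the strncmp length is enough here.  Lengths with the top bit set
   (>= 2^63) cannot bound a real buffer, so they can be handed to the
   unbounded OVERFLOW_STRCMP path, while 0 and 1 take the short path.
   The helper name is an assumption for illustration.

   #include <stdint.h>

   static inline int
   length_is_one_or_less (uint64_t n)
   {
     // `cmp $1; jle` above: signed <= 1 catches n == 0, n == 1 and
     // every n >= 2^63 (negative when reinterpreted as signed).
     return (int64_t) n <= 1;
   }  */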
255
256 # if defined USE_AS_STRCASECMP_L
257 .section .rodata.cst32, "aM", @progbits, 32
258 .align 32
259 L(lcase_min):
260 .quad 0x4141414141414141
261 .quad 0x4141414141414141
262 .quad 0x4141414141414141
263 .quad 0x4141414141414141
264 L(lcase_max):
265 .quad 0x1a1a1a1a1a1a1a1a
266 .quad 0x1a1a1a1a1a1a1a1a
267 .quad 0x1a1a1a1a1a1a1a1a
268 .quad 0x1a1a1a1a1a1a1a1a
269 L(case_add):
270 .quad 0x2020202020202020
271 .quad 0x2020202020202020
272 .quad 0x2020202020202020
273 .quad 0x2020202020202020
274 .previous
275
276 vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
277 vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
278 vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
279 # endif
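
/* Illustrative sketch (not part of the build): the branchless case
   conversion encoded by LCASE_MIN (0x41 == 'A'), LCASE_MAX (0x1a == 26)
   and CASE_ADD (0x20), written out for a single byte in C.  TOLOWER_YMM
   above does the same thing for 32 bytes at once.  The helper name is
   an assumption for illustration.

   #include <stdint.h>

   static inline uint8_t
   tolower_ascii_branchless (uint8_t c)
   {
     // vpsubb: bias so that 'A'..'Z' map to 0..25.
     uint8_t biased = (uint8_t) (c - 0x41);
     // vpcmpub $1 (unsigned less-than) against 0x1a selects exactly the
     // uppercase letters; vpaddb then adds 0x20 only under that mask.
     return biased < 0x1a ? (uint8_t) (c + 0x20) : c;
   }  */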
280
281 movl %edi, %eax
282 orl %esi, %eax
283 /* Shift out the bits irrelevant to the page boundary ([63:12]).  */
284 sall $20, %eax
285 /* Check if s1 or s2 may cross a page in next 4x VEC loads. */
286 cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
287 ja L(page_cross)
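
/* Illustrative sketch (not part of the build): the page-cross filter
   above in C.  It ORs the two pointers and checks whether the combined
   page-offset bits land within 4 * VEC_SIZE of the page end; this is
   conservative (false positives are re-checked in L(page_cross)).  The
   helper name is an assumption for illustration.

   #include <stdint.h>

   static inline int
   may_cross_page (const void *s1, const void *s2)
   {
     // orl + sall $20 + cmpl: only the low 12 (page offset) bits of
     // the ORed pointers survive the shift and the compare.
     uint32_t off = ((uintptr_t) s1 | (uintptr_t) s2) & (4096 - 1);
     return off > 4096 - 4 * 32;   // PAGE_SIZE - 4 * VEC_SIZE
   }  */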
288
289 L(no_page_cross):
290 /* Safe to compare 4x vectors. */
291 VMOVU (%rdi), %YMM0
292 VPTESTM %YMM0, %YMM0, %k2
293 /* Each bit cleared in K1 represents a mismatch between YMM0 and the
294    32 bytes at (%rsi), or a null CHAR in YMM0.  */
295 CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
296 kmovd %k1, %ecx
297 # ifdef USE_AS_STRNCMP
298 cmpq $CHAR_PER_VEC, %rdx
299 jbe L(vec_0_test_len)
300 # endif
301
302 /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
303 wcscmp/wcsncmp. */
304
305 /* All 1s means everything compared equal.  TESTEQ will overflow to
306    zero in the all-equal case.  Otherwise the 1s carry until the
307    position of the first mismatch.  */
308 TESTEQ %ecx
309 jz L(more_3x_vec)
310
311 .p2align 4,, 4
312 L(return_vec_0):
313 tzcntl %ecx, %ecx
314 # ifdef USE_AS_WCSCMP
315 movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
316 xorl %eax, %eax
317 cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
318 je L(ret0)
319 setl %al
320 negl %eax
321 orl $1, %eax
322 # else
323 movzbl (%rdi, %rcx), %eax
324 movzbl (%rsi, %rcx), %ecx
325 TOLOWER_gpr (%rax, %eax)
326 TOLOWER_gpr (%rcx, %ecx)
327 subl %ecx, %eax
328 # endif
329 L(ret0):
330 ret
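
/* Illustrative sketch (not part of the build): the wide-character
   return-value computation above (setl; negl; orl $1).  wchar_t
   elements must be compared as signed values and the result only needs
   the right sign, so the non-zero case is forced to -1 or +1.  The
   helper name is an assumption for illustration.

   static inline int
   wcs_cmp_result (int c1, int c2)   // one dword char from each string
   {
     if (c1 == c2)
       return 0;
     int lt = -(c1 < c2);     // setl; negl: -1 if c1 < c2, else 0
     return lt | 1;           // orl $1: forces the result to -1 or +1
   }  */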
331
332 # ifdef USE_AS_STRNCMP
333 .p2align 4,, 4
334 L(vec_0_test_len):
335 notl %ecx
336 bzhil %edx, %ecx, %eax
337 jnz L(return_vec_0)
338 /* Align if this will cross a fetch block.  */
339 .p2align 4,, 2
340 L(ret_zero):
341 xorl %eax, %eax
342 ret
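
/* Illustrative sketch (not part of the build): the notl + bzhil length
   clamp in L(vec_0_test_len) above.  After notl, set bits mark
   mismatch/null positions; bzhi keeps only the positions below the
   length bound, so a non-zero result means the difference lies within
   the first len chars.  The helper name is an assumption.

   #include <stdint.h>

   static inline uint32_t
   in_bounds_diff_mask (uint32_t eq_mask, unsigned int len) // len <= 32 here
   {
     uint32_t diff = ~eq_mask;                             // notl
     return len >= 32 ? diff : diff & ((1u << len) - 1);   // bzhil
   }  */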
343
344 .p2align 4,, 5
345 L(one_or_less):
346 # ifdef USE_AS_STRCASECMP_L
347 /* Set locale argument for strcasecmp. */
348 movq %LOCALE_REG, %rdx
349 # endif
350 jb L(ret_zero)
351 /* 'nbe' covers the case where length is negative (large
352 unsigned). */
353 jnbe OVERFLOW_STRCMP
354 # ifdef USE_AS_WCSCMP
355 movl (%rdi), %edx
356 xorl %eax, %eax
357 cmpl (%rsi), %edx
358 je L(ret1)
359 setl %al
360 negl %eax
361 orl $1, %eax
362 # else
363 movzbl (%rdi), %eax
364 movzbl (%rsi), %ecx
365 TOLOWER_gpr (%rax, %eax)
366 TOLOWER_gpr (%rcx, %ecx)
367 subl %ecx, %eax
368 # endif
369 L(ret1):
370 ret
371 # endif
372
373 .p2align 4,, 10
374 L(return_vec_1):
375 tzcntl %ecx, %ecx
376 # ifdef USE_AS_STRNCMP
377 /* rdx must be > CHAR_PER_VEC so it's safe to subtract without
378    worrying about underflow.  */
379 addq $-CHAR_PER_VEC, %rdx
380 cmpq %rcx, %rdx
381 jbe L(ret_zero)
382 # endif
383 # ifdef USE_AS_WCSCMP
384 movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
385 xorl %eax, %eax
386 cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
387 je L(ret2)
388 setl %al
389 negl %eax
390 orl $1, %eax
391 # else
392 movzbl VEC_SIZE(%rdi, %rcx), %eax
393 movzbl VEC_SIZE(%rsi, %rcx), %ecx
394 TOLOWER_gpr (%rax, %eax)
395 TOLOWER_gpr (%rcx, %ecx)
396 subl %ecx, %eax
397 # endif
398 L(ret2):
399 ret
400
401 .p2align 4,, 10
402 # ifdef USE_AS_STRNCMP
403 L(return_vec_3):
404 # if CHAR_PER_VEC <= 16
405 sall $CHAR_PER_VEC, %ecx
406 # else
407 salq $CHAR_PER_VEC, %rcx
408 # endif
409 # endif
410 L(return_vec_2):
411 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
412 tzcntl %ecx, %ecx
413 # else
414 tzcntq %rcx, %rcx
415 # endif
416
417 # ifdef USE_AS_STRNCMP
418 cmpq %rcx, %rdx
419 jbe L(ret_zero)
420 # endif
421
422 # ifdef USE_AS_WCSCMP
423 movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
424 xorl %eax, %eax
425 cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
426 je L(ret3)
427 setl %al
428 negl %eax
429 orl $1, %eax
430 # else
431 movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
432 movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
433 TOLOWER_gpr (%rax, %eax)
434 TOLOWER_gpr (%rcx, %ecx)
435 subl %ecx, %eax
436 # endif
437 L(ret3):
438 ret
439
440 # ifndef USE_AS_STRNCMP
441 .p2align 4,, 10
442 L(return_vec_3):
443 tzcntl %ecx, %ecx
444 # ifdef USE_AS_WCSCMP
445 movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
446 xorl %eax, %eax
447 cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
448 je L(ret4)
449 setl %al
450 negl %eax
451 orl $1, %eax
452 # else
453 movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
454 movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
455 TOLOWER_gpr (%rax, %eax)
456 TOLOWER_gpr (%rcx, %ecx)
457 subl %ecx, %eax
458 # endif
459 L(ret4):
460 ret
461 # endif
462
463 /* 32 byte align here ensures the main loop is ideally aligned
464 for DSB. */
465 .p2align 5
466 L(more_3x_vec):
467 /* Safe to compare 4x vectors. */
468 VMOVU (VEC_SIZE)(%rdi), %YMM0
469 VPTESTM %YMM0, %YMM0, %k2
470 CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
471 kmovd %k1, %ecx
472 TESTEQ %ecx
473 jnz L(return_vec_1)
474
475 # ifdef USE_AS_STRNCMP
476 subq $(CHAR_PER_VEC * 2), %rdx
477 jbe L(ret_zero)
478 # endif
479
480 VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
481 VPTESTM %YMM0, %YMM0, %k2
482 CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
483 kmovd %k1, %ecx
484 TESTEQ %ecx
485 jnz L(return_vec_2)
486
487 VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
488 VPTESTM %YMM0, %YMM0, %k2
489 CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
490 kmovd %k1, %ecx
491 TESTEQ %ecx
492 jnz L(return_vec_3)
493
494 # ifdef USE_AS_STRNCMP
495 cmpq $(CHAR_PER_VEC * 2), %rdx
496 jbe L(ret_zero)
497 # endif
498
499
500 # ifdef USE_AS_WCSCMP
501 /* Any non-zero positive value that doesn't interfere with 0x1.
502    */
503 movl $2, %r8d
504
505 # else
506 xorl %r8d, %r8d
507 # endif
508
509 /* The prepare labels are various entry points from the page
510 cross logic. */
511 L(prepare_loop):
512
513 # ifdef USE_AS_STRNCMP
514 # ifdef USE_AS_WCSCMP
515 L(prepare_loop_no_len):
516 movl %edi, %ecx
517 andl $(VEC_SIZE * 4 - 1), %ecx
518 shrl $2, %ecx
519 leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
520 # else
521 /* Store N + (VEC_SIZE * 4) and place the check at the beginning
522    of the loop.  */
523 leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
524 L(prepare_loop_no_len):
525 # endif
526 # else
527 L(prepare_loop_no_len):
528 # endif
529
530 /* Align s1 and adjust s2 accordingly. */
531 subq %rdi, %rsi
532 andq $-(VEC_SIZE * 4), %rdi
533 L(prepare_loop_readj):
534 addq %rdi, %rsi
535 # if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
536 subq %rdi, %rdx
537 # endif
538
539 L(prepare_loop_aligned):
540 /* eax stores distance from rsi to next page cross. These cases
541 need to be handled specially as the 4x loop could potentially
542 read memory past the length of s1 or s2 and across a page
543 boundary. */
544 movl $-(VEC_SIZE * 4), %eax
545 subl %esi, %eax
546 andl $(PAGE_SIZE - 1), %eax
547
548
549 /* Loop 4x comparisons at a time. */
550 .p2align 4
551 L(loop):
552
553 /* End condition for strncmp. */
554 # ifdef USE_AS_STRNCMP
555 subq $(CHAR_PER_VEC * 4), %rdx
556 jbe L(ret_zero)
557 # endif
558
559 subq $-(VEC_SIZE * 4), %rdi
560 subq $-(VEC_SIZE * 4), %rsi
561
562 /* Check if rsi loads will cross a page boundary. */
563 addl $-(VEC_SIZE * 4), %eax
564 jnb L(page_cross_during_loop)
565
566 /* Loop entry after handling page cross during loop. */
567 L(loop_skip_page_cross_check):
568 VMOVA (VEC_SIZE * 0)(%rdi), %YMM0
569 VMOVA (VEC_SIZE * 1)(%rdi), %YMM2
570 VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
571 VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
572
573 VPMINU %YMM0, %YMM2, %YMM8
574 VPMINU %YMM4, %YMM6, %YMM9
575
576 /* A zero CHAR in YMM9 means that there is a null CHAR. */
577 VPMINU %YMM8, %YMM9, %YMM9
578
579 /* Each bit set in K1 represents a non-null CHAR in YMM9. */
580 VPTESTM %YMM9, %YMM9, %k1
581 # ifndef USE_AS_STRCASECMP_L
582 vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
583 vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
584 vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
585 /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
586 oring with YMM1. Result is stored in YMM6. */
587 vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
588 # else
589 VMOVU (VEC_SIZE * 0)(%rsi), %YMM1
590 TOLOWER_YMM (%YMM0, %YMM1)
591 VMOVU (VEC_SIZE * 1)(%rsi), %YMM3
592 TOLOWER_YMM (%YMM2, %YMM3)
593 VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
594 TOLOWER_YMM (%YMM4, %YMM5)
595 VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
596 TOLOWER_YMM (%YMM6, %YMM7)
597 vpxorq %YMM0, %YMM1, %YMM1
598 vpxorq %YMM2, %YMM3, %YMM3
599 vpxorq %YMM4, %YMM5, %YMM5
600 vpternlogd $0xde, %YMM7, %YMM1, %YMM6
601 # endif
602 /* Or together YMM3, YMM5, and YMM6. */
603 vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
604
605
606 /* A non-zero CHAR in YMM6 represents a mismatch. */
607 VPTESTNM %YMM6, %YMM6, %k0{%k1}
608 kmovd %k0, %LOOP_REG
609
610 TESTEQ %LOOP_REG
611 jz L(loop)
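
/* Illustrative sketches (not part of the build): the two vpternlogd
   immediates used in the loop above, written as plain bitwise
   expressions per lane.  With operands dst, src1, src2 the immediate is
   a truth table indexed by (dst << 2) | (src1 << 1) | src2.  The
   function names are assumptions for illustration.

   #include <stdint.h>

   // $0xde: dst = src1 | (dst ^ src2).  One instruction xors the s2
   // vector into the s1 vector while ORing in an earlier difference
   // mask.
   static inline uint32_t
   ternlog_0xde (uint32_t dst, uint32_t src1, uint32_t src2)
   {
     return src1 | (dst ^ src2);
   }

   // $0xfe: dst = dst | src1 | src2.  A three-way OR that collapses the
   // per-vector difference masks into one.
   static inline uint32_t
   ternlog_0xfe (uint32_t dst, uint32_t src1, uint32_t src2)
   {
     return dst | src1 | src2;
   }  */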
612
613
614 /* Find which VEC has the mismatch or end of string.  */
615 VPTESTM %YMM0, %YMM0, %k1
616 VPTESTNM %YMM1, %YMM1, %k0{%k1}
617 kmovd %k0, %ecx
618 TESTEQ %ecx
619 jnz L(return_vec_0_end)
620
621 VPTESTM %YMM2, %YMM2, %k1
622 VPTESTNM %YMM3, %YMM3, %k0{%k1}
623 kmovd %k0, %ecx
624 TESTEQ %ecx
625 jnz L(return_vec_1_end)
626
627
628 /* Handle VEC 2 and 3 without branches. */
629 L(return_vec_2_3_end):
630 # ifdef USE_AS_STRNCMP
631 subq $(CHAR_PER_VEC * 2), %rdx
632 jbe L(ret_zero_end)
633 # endif
634
635 VPTESTM %YMM4, %YMM4, %k1
636 VPTESTNM %YMM5, %YMM5, %k0{%k1}
637 kmovd %k0, %ecx
638 TESTEQ %ecx
639 # if CHAR_PER_VEC <= 16
640 sall $CHAR_PER_VEC, %LOOP_REG
641 orl %ecx, %LOOP_REG
642 # else
643 salq $CHAR_PER_VEC, %LOOP_REG64
644 orq %rcx, %LOOP_REG64
645 # endif
646 L(return_vec_3_end):
647 /* LOOP_REG contains the null/mismatch results from the loop.  If
648    VEC 0, 1, and 2 all have no null and no mismatches, then the
649    mismatch must come entirely from VEC 3, which is fully represented
650    by LOOP_REG.  */
651 # if CHAR_PER_VEC <= 16
652 tzcntl %LOOP_REG, %LOOP_REG
653 # else
654 tzcntq %LOOP_REG64, %LOOP_REG64
655 # endif
656 # ifdef USE_AS_STRNCMP
657 cmpq %LOOP_REG64, %rdx
658 jbe L(ret_zero_end)
659 # endif
660
661 # ifdef USE_AS_WCSCMP
662 movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
663 xorl %eax, %eax
664 cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
665 je L(ret5)
666 setl %al
667 negl %eax
668 xorl %r8d, %eax
669 # else
670 movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
671 movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
672 TOLOWER_gpr (%rax, %eax)
673 TOLOWER_gpr (%rcx, %ecx)
674 subl %ecx, %eax
675 xorl %r8d, %eax
676 subl %r8d, %eax
677 # endif
678 L(ret5):
679 ret
680
681 # ifdef USE_AS_STRNCMP
682 .p2align 4,, 2
683 L(ret_zero_end):
684 xorl %eax, %eax
685 ret
686 # endif
687
688
689 /* The L(return_vec_N_end) labels differ from L(return_vec_N) in
690    that they use the value of `r8` to negate the return value.  This
691    is because the page cross logic can swap `rdi` and `rsi`.  */
692 .p2align 4,, 10
693 # ifdef USE_AS_STRNCMP
694 L(return_vec_1_end):
695 # if CHAR_PER_VEC <= 16
696 sall $CHAR_PER_VEC, %ecx
697 # else
698 salq $CHAR_PER_VEC, %rcx
699 # endif
700 # endif
701 L(return_vec_0_end):
702 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
703 tzcntl %ecx, %ecx
704 # else
705 tzcntq %rcx, %rcx
706 # endif
707
708 # ifdef USE_AS_STRNCMP
709 cmpq %rcx, %rdx
710 jbe L(ret_zero_end)
711 # endif
712
713 # ifdef USE_AS_WCSCMP
714 movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
715 xorl %eax, %eax
716 cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
717 je L(ret6)
718 setl %al
719 negl %eax
720 /* This is the non-zero case for `eax` so just xorl with `r8d` to
721    flip the sign if `rdi` and `rsi` were swapped.  */
722 xorl %r8d, %eax
723 # else
724 movzbl (%rdi, %rcx), %eax
725 movzbl (%rsi, %rcx), %ecx
726 TOLOWER_gpr (%rax, %eax)
727 TOLOWER_gpr (%rcx, %ecx)
728 subl %ecx, %eax
729 /* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
730    logic.  Subtract `r8d` after the xor for the zero case.  */
731 xorl %r8d, %eax
732 subl %r8d, %eax
733 # endif
734 L(ret6):
735 ret
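
/* Illustrative sketch (not part of the build): the branchless
   conditional negation used by the byte-string *_end return paths
   above.  r8 is 0 when rdi/rsi kept their original roles and -1 when
   the page cross logic swapped them; (x ^ -1) - (-1) == -x while
   (x ^ 0) - 0 == x.  The helper name is an assumption.

   static inline int
   maybe_negate (int diff, int swap_mask)   // swap_mask is 0 or -1
   {
     // xorl %r8d, %eax; subl %r8d, %eax
     return (diff ^ swap_mask) - swap_mask;
   }  */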
736
737 # ifndef USE_AS_STRNCMP
738 .p2align 4,, 10
739 L(return_vec_1_end):
740 tzcntl %ecx, %ecx
741 # ifdef USE_AS_WCSCMP
742 movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
743 xorl %eax, %eax
744 cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
745 je L(ret7)
746 setl %al
747 negl %eax
748 xorl %r8d, %eax
749 # else
750 movzbl VEC_SIZE(%rdi, %rcx), %eax
751 movzbl VEC_SIZE(%rsi, %rcx), %ecx
752 TOLOWER_gpr (%rax, %eax)
753 TOLOWER_gpr (%rcx, %ecx)
754 subl %ecx, %eax
755 xorl %r8d, %eax
756 subl %r8d, %eax
757 # endif
758 L(ret7):
759 ret
760 # endif
761
762
763 /* Page cross in rsi in next 4x VEC. */
764
765 /* TODO: Improve logic here. */
766 .p2align 4,, 10
767 L(page_cross_during_loop):
768 /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */
769
770 /* Optimistically rsi and rdi are both aligned, in which case we
771    don't need any logic here.  */
772 cmpl $-(VEC_SIZE * 4), %eax
773 /* By not adjusting eax before jumping back to the loop we will
774    never hit the page cross case again.  */
775 je L(loop_skip_page_cross_check)
776
777 /* Check if we can safely load a VEC. */
778 cmpl $-(VEC_SIZE * 3), %eax
779 jle L(less_1x_vec_till_page_cross)
780
781 VMOVA (%rdi), %YMM0
782 VPTESTM %YMM0, %YMM0, %k2
783 CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
784 kmovd %k1, %ecx
785 TESTEQ %ecx
786 jnz L(return_vec_0_end)
787
788 /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */
789 cmpl $-(VEC_SIZE * 2), %eax
790 jg L(more_2x_vec_till_page_cross)
791
792 .p2align 4,, 4
793 L(less_1x_vec_till_page_cross):
794 subl $-(VEC_SIZE * 4), %eax
795 /* Guaranteed safe to read from rdi - VEC_SIZE here.  The only
796    concerning case is the first iteration, if the incoming s1 was near
797    the start of a page and s2 near the end.  If s1 was near the start
798    of the page we already aligned up to the nearest VEC_SIZE * 4 so it
799    is guaranteed safe to read back -VEC_SIZE.  If rdi is truly at the
800    start of a page here, it means the previous page (rdi - VEC_SIZE)
801    has already been loaded earlier so it must be valid.  */
802 VMOVU -VEC_SIZE(%rdi, %rax), %YMM0
803 VPTESTM %YMM0, %YMM0, %k2
804 CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
805 /* Mask of potentially valid bits.  The lower bits can come from
806    out-of-range comparisons (but are safe regarding page crosses).  */
807
808 # ifdef USE_AS_WCSCMP
809 movl $-1, %r10d
810 movl %esi, %ecx
811 andl $(VEC_SIZE - 1), %ecx
812 shrl $2, %ecx
813 shlxl %ecx, %r10d, %ecx
814 movzbl %cl, %r10d
815 # else
816 movl $-1, %ecx
817 shlxl %esi, %ecx, %r10d
818 # endif
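
/* Illustrative sketch (not part of the build): the shlx-built mask
   above, for the byte-string case.  The loaded 32 bytes end at the page
   boundary, so the low lanes fall before the current s2 position; only
   bits at or above rsi's misalignment within the vector are considered.
   The helper name is an assumption for illustration.

   #include <stdint.h>

   static inline uint32_t
   valid_bits_mask (uintptr_t s2)
   {
     // movl $-1; shlxl %esi: the shift count is taken mod 32, i.e. the
     // misalignment of s2 within a 32-byte vector.
     return 0xffffffffu << (s2 & 31);
   }  */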
819
820 kmovd %k1, %ecx
821 notl %ecx
822
823
824 # ifdef USE_AS_STRNCMP
825 # ifdef USE_AS_WCSCMP
826 /* NB: strcasecmp not used with WCSCMP so this access to r11 is
827 safe. */
828 movl %eax, %r11d
829 shrl $2, %r11d
830 cmpq %r11, %rdx
831 # else
832 cmpq %rax, %rdx
833 # endif
834 jbe L(return_page_cross_end_check)
835 # endif
836 movl %eax, %OFFSET_REG
837
838 /* Readjust eax before potentially returning to the loop. */
839 addl $(PAGE_SIZE - VEC_SIZE * 4), %eax
840
841 andl %r10d, %ecx
842 jz L(loop_skip_page_cross_check)
843
844 .p2align 4,, 3
845 L(return_page_cross_end):
846 tzcntl %ecx, %ecx
847
848 # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
849 leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
850 L(return_page_cross_cmp_mem):
851 # else
852 addl %OFFSET_REG, %ecx
853 # endif
854 # ifdef USE_AS_WCSCMP
855 movl VEC_OFFSET(%rdi, %rcx), %edx
856 xorl %eax, %eax
857 cmpl VEC_OFFSET(%rsi, %rcx), %edx
858 je L(ret8)
859 setl %al
860 negl %eax
861 xorl %r8d, %eax
862 # else
863 movzbl VEC_OFFSET(%rdi, %rcx), %eax
864 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
865 TOLOWER_gpr (%rax, %eax)
866 TOLOWER_gpr (%rcx, %ecx)
867 subl %ecx, %eax
868 xorl %r8d, %eax
869 subl %r8d, %eax
870 # endif
871 L(ret8):
872 ret
873
874 # ifdef USE_AS_STRNCMP
875 .p2align 4,, 10
876 L(return_page_cross_end_check):
877 andl %r10d, %ecx
878 tzcntl %ecx, %ecx
879 leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
880 # ifdef USE_AS_WCSCMP
881 sall $2, %edx
882 # endif
883 cmpl %ecx, %edx
884 ja L(return_page_cross_cmp_mem)
885 xorl %eax, %eax
886 ret
887 # endif
888
889
890 .p2align 4,, 10
891 L(more_2x_vec_till_page_cross):
892 /* If it is more than 2x VEC until the page cross we will complete
893    a full loop iteration here.  */
894
895 VMOVA VEC_SIZE(%rdi), %YMM0
896 VPTESTM %YMM0, %YMM0, %k2
897 CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
898 kmovd %k1, %ecx
899 TESTEQ %ecx
900 jnz L(return_vec_1_end)
901
902 # ifdef USE_AS_STRNCMP
903 cmpq $(CHAR_PER_VEC * 2), %rdx
904 jbe L(ret_zero_in_loop_page_cross)
905 # endif
906
907 subl $-(VEC_SIZE * 4), %eax
908
909 /* Safe to include comparisons from lower bytes. */
910 VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
911 VPTESTM %YMM0, %YMM0, %k2
912 CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
913 kmovd %k1, %ecx
914 TESTEQ %ecx
915 jnz L(return_vec_page_cross_0)
916
917 VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
918 VPTESTM %YMM0, %YMM0, %k2
919 CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
920 kmovd %k1, %ecx
921 TESTEQ %ecx
922 jnz L(return_vec_page_cross_1)
923
924 # ifdef USE_AS_STRNCMP
925 /* Must check the length here as it might preclude reading the
926    next page.  */
927 # ifdef USE_AS_WCSCMP
928 /* NB: strcasecmp not used with WCSCMP so this access to r11 is
929 safe. */
930 movl %eax, %r11d
931 shrl $2, %r11d
932 cmpq %r11, %rdx
933 # else
934 cmpq %rax, %rdx
935 # endif
936 jbe L(ret_zero_in_loop_page_cross)
937 # endif
938
939 /* Finish the loop. */
940 VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
941 VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
942 VPMINU %YMM4, %YMM6, %YMM9
943 VPTESTM %YMM9, %YMM9, %k1
944 # ifndef USE_AS_STRCASECMP_L
945 vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
946 /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */
947 vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
948 # else
949 VMOVU (VEC_SIZE * 2)(%rsi), %YMM5
950 TOLOWER_YMM (%YMM4, %YMM5)
951 VMOVU (VEC_SIZE * 3)(%rsi), %YMM7
952 TOLOWER_YMM (%YMM6, %YMM7)
953 vpxorq %YMM4, %YMM5, %YMM5
954 vpternlogd $0xde, %YMM7, %YMM5, %YMM6
955 # endif
956 VPTESTNM %YMM6, %YMM6, %k0{%k1}
957 kmovd %k0, %LOOP_REG
958 TESTEQ %LOOP_REG
959 jnz L(return_vec_2_3_end)
960
961 /* Best for code size to include an unconditional jmp here.  If this
962    case is hot it would be faster to duplicate the
963    L(return_vec_2_3_end) code as a fall-through and jump back to the
964    loop on a mismatch comparison.  */
965 subq $-(VEC_SIZE * 4), %rdi
966 subq $-(VEC_SIZE * 4), %rsi
967 addl $(PAGE_SIZE - VEC_SIZE * 8), %eax
968 # ifdef USE_AS_STRNCMP
969 subq $(CHAR_PER_VEC * 4), %rdx
970 ja L(loop_skip_page_cross_check)
971 L(ret_zero_in_loop_page_cross):
972 xorl %eax, %eax
973 ret
974 # else
975 jmp L(loop_skip_page_cross_check)
976 # endif
977
978
979 .p2align 4,, 10
980 L(return_vec_page_cross_0):
981 addl $-VEC_SIZE, %eax
982 L(return_vec_page_cross_1):
983 tzcntl %ecx, %ecx
984 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
985 leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
986 # ifdef USE_AS_STRNCMP
987 # ifdef USE_AS_WCSCMP
988 /* Must divide ecx instead of multiplying rdx due to possible overflow.  */
989 movl %ecx, %eax
990 shrl $2, %eax
991 cmpq %rax, %rdx
992 # else
993 cmpq %rcx, %rdx
994 # endif
995 jbe L(ret_zero_in_loop_page_cross)
996 # endif
997 # else
998 addl %eax, %ecx
999 # endif
1000
1001 # ifdef USE_AS_WCSCMP
1002 movl VEC_OFFSET(%rdi, %rcx), %edx
1003 xorl %eax, %eax
1004 cmpl VEC_OFFSET(%rsi, %rcx), %edx
1005 je L(ret9)
1006 setl %al
1007 negl %eax
1008 xorl %r8d, %eax
1009 # else
1010 movzbl VEC_OFFSET(%rdi, %rcx), %eax
1011 movzbl VEC_OFFSET(%rsi, %rcx), %ecx
1012 TOLOWER_gpr (%rax, %eax)
1013 TOLOWER_gpr (%rcx, %ecx)
1014 subl %ecx, %eax
1015 xorl %r8d, %eax
1016 subl %r8d, %eax
1017 # endif
1018 L(ret9):
1019 ret
1020
1021
1022 .p2align 4,, 10
1023 L(page_cross):
1024 # ifndef USE_AS_STRNCMP
1025 /* If both are VEC aligned we don't need any special logic here.
1026    Only valid for strcmp where the stop condition is guaranteed to
1027    be reachable by just reading memory.  */
1028 testl $((VEC_SIZE - 1) << 20), %eax
1029 jz L(no_page_cross)
1030 # endif
1031
1032 movl %edi, %eax
1033 movl %esi, %ecx
1034 andl $(PAGE_SIZE - 1), %eax
1035 andl $(PAGE_SIZE - 1), %ecx
1036
1037 xorl %OFFSET_REG, %OFFSET_REG
1038
1039 /* Check which is closer to page cross, s1 or s2. */
1040 cmpl %eax, %ecx
1041 jg L(page_cross_s2)
1042
1043 /* The previous page cross check has false positives.  Check for
1044    a true positive, as the page cross logic is very expensive.  */
1045 subl $(PAGE_SIZE - VEC_SIZE * 4), %eax
1046 jbe L(no_page_cross)
1047
1048
1049 /* Set r8 to not interfere with normal return value (rdi and rsi
1050 did not swap). */
1051 # ifdef USE_AS_WCSCMP
1052 /* Any non-zero positive value that doesn't interfere with 0x1.
1053    */
1054 movl $2, %r8d
1055 # else
1056 xorl %r8d, %r8d
1057 # endif
1058
1059 /* Check if less than 1x VEC till page cross. */
1060 subl $(VEC_SIZE * 3), %eax
1061 jg L(less_1x_vec_till_page)
1062
1063
1064 /* If more than 1x VEC till the page cross, loop through safely
1065    loadable memory until within 1x VEC of the page cross.  */
1066 .p2align 4,, 8
1067 L(page_cross_loop):
1068 VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
1069 VPTESTM %YMM0, %YMM0, %k2
1070 CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
1071 kmovd %k1, %ecx
1072 TESTEQ %ecx
1073 jnz L(check_ret_vec_page_cross)
1074 addl $CHAR_PER_VEC, %OFFSET_REG
1075 # ifdef USE_AS_STRNCMP
1076 cmpq %OFFSET_REG64, %rdx
1077 jbe L(ret_zero_page_cross)
1078 # endif
1079 addl $VEC_SIZE, %eax
1080 jl L(page_cross_loop)
1081
1082 # ifdef USE_AS_WCSCMP
1083 shrl $2, %eax
1084 # endif
1085
1086
1087 subl %eax, %OFFSET_REG
1088 /* OFFSET_REG has the distance to the page cross - VEC_SIZE.
1089    Guaranteed to not cross the page so it is safe to load.  Since we
1090    have already loaded at least 1 VEC from rsi it is also guaranteed
1091    to be safe.  */
1092 VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
1093 VPTESTM %YMM0, %YMM0, %k2
1094 CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
1095
1096 kmovd %k1, %ecx
1097 # ifdef USE_AS_STRNCMP
1098 leal CHAR_PER_VEC(%OFFSET_REG64), %eax
1099 cmpq %rax, %rdx
1100 jbe L(check_ret_vec_page_cross2)
1101 # ifdef USE_AS_WCSCMP
1102 addq $-(CHAR_PER_VEC * 2), %rdx
1103 # else
1104 addq %rdi, %rdx
1105 # endif
1106 # endif
1107 TESTEQ %ecx
1108 jz L(prepare_loop_no_len)
1109
1110 .p2align 4,, 4
1111 L(ret_vec_page_cross):
1112 # ifndef USE_AS_STRNCMP
1113 L(check_ret_vec_page_cross):
1114 # endif
1115 tzcntl %ecx, %ecx
1116 addl %OFFSET_REG, %ecx
1117 L(ret_vec_page_cross_cont):
1118 # ifdef USE_AS_WCSCMP
1119 movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
1120 xorl %eax, %eax
1121 cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
1122 je L(ret12)
1123 setl %al
1124 negl %eax
1125 xorl %r8d, %eax
1126 # else
1127 movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax
1128 movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx
1129 TOLOWER_gpr (%rax, %eax)
1130 TOLOWER_gpr (%rcx, %ecx)
1131 subl %ecx, %eax
1132 xorl %r8d, %eax
1133 subl %r8d, %eax
1134 # endif
1135 L(ret12):
1136 ret
1137
1138
1139 # ifdef USE_AS_STRNCMP
1140 .p2align 4,, 10
1141 L(check_ret_vec_page_cross2):
1142 TESTEQ %ecx
1143 L(check_ret_vec_page_cross):
1144 tzcntl %ecx, %ecx
1145 addl %OFFSET_REG, %ecx
1146 cmpq %rcx, %rdx
1147 ja L(ret_vec_page_cross_cont)
1148 .p2align 4,, 2
1149 L(ret_zero_page_cross):
1150 xorl %eax, %eax
1151 ret
1152 # endif
1153
1154 .p2align 4,, 4
1155 L(page_cross_s2):
1156 /* Ensure this is a true page cross. */
1157 subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx
1158 jbe L(no_page_cross)
1159
1160
1161 movl %ecx, %eax
1162 movq %rdi, %rcx
1163 movq %rsi, %rdi
1164 movq %rcx, %rsi
1165
1166 /* Set r8 to negate the return value, as rdi and rsi are swapped.  */
1167 # ifdef USE_AS_WCSCMP
1168 movl $-4, %r8d
1169 # else
1170 movl $-1, %r8d
1171 # endif
1172 xorl %OFFSET_REG, %OFFSET_REG
1173
1174 /* Check if more than 1x VEC till page cross. */
1175 subl $(VEC_SIZE * 3), %eax
1176 jle L(page_cross_loop)
1177
1178 .p2align 4,, 6
1179 L(less_1x_vec_till_page):
1180 # ifdef USE_AS_WCSCMP
1181 shrl $2, %eax
1182 # endif
1183 /* Find largest load size we can use. */
1184 cmpl $(16 / SIZE_OF_CHAR), %eax
1185 ja L(less_16_till_page)
1186
1187 /* Use 16 byte comparison. */
1188 vmovdqu (%rdi), %xmm0
1189 VPTESTM %xmm0, %xmm0, %k2
1190 CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
1191 kmovd %k1, %ecx
1192 # ifdef USE_AS_WCSCMP
1193 subl $0xf, %ecx
1194 # else
1195 incw %cx
1196 # endif
1197 jnz L(check_ret_vec_page_cross)
1198 movl $(16 / SIZE_OF_CHAR), %OFFSET_REG
1199 # ifdef USE_AS_STRNCMP
1200 cmpq %OFFSET_REG64, %rdx
1201 jbe L(ret_zero_page_cross_slow_case0)
1202 subl %eax, %OFFSET_REG
1203 # else
1204 /* Explicit check for 16 byte alignment. */
1205 subl %eax, %OFFSET_REG
1206 jz L(prepare_loop)
1207 # endif
1208 vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1209 VPTESTM %xmm0, %xmm0, %k2
1210 CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
1211 kmovd %k1, %ecx
1212 # ifdef USE_AS_WCSCMP
1213 subl $0xf, %ecx
1214 # else
1215 incw %cx
1216 # endif
1217 jnz L(check_ret_vec_page_cross)
1218 # ifdef USE_AS_STRNCMP
1219 addl $(16 / SIZE_OF_CHAR), %OFFSET_REG
1220 subq %OFFSET_REG64, %rdx
1221 jbe L(ret_zero_page_cross_slow_case0)
1222 subq $-(CHAR_PER_VEC * 4), %rdx
1223
1224 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1225 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1226 # else
1227 leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1228 leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1229 # endif
1230 jmp L(prepare_loop_aligned)
1231
1232 # ifdef USE_AS_STRNCMP
1233 .p2align 4,, 2
1234 L(ret_zero_page_cross_slow_case0):
1235 xorl %eax, %eax
1236 ret
1237 # endif
1238
1239
1240 .p2align 4,, 10
1241 L(less_16_till_page):
1242 cmpl $(24 / SIZE_OF_CHAR), %eax
1243 ja L(less_8_till_page)
1244
1245 /* Use 8 byte comparison. */
1246 vmovq (%rdi), %xmm0
1247 vmovq (%rsi), %xmm1
1248 VPTESTM %xmm0, %xmm0, %k2
1249 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1250 kmovd %k1, %ecx
1251 # ifdef USE_AS_WCSCMP
1252 subl $0x3, %ecx
1253 # else
1254 incb %cl
1255 # endif
1256 jnz L(check_ret_vec_page_cross)
1257
1258
1259 # ifdef USE_AS_STRNCMP
1260 cmpq $(8 / SIZE_OF_CHAR), %rdx
1261 jbe L(ret_zero_page_cross_slow_case0)
1262 # endif
1263 movl $(24 / SIZE_OF_CHAR), %OFFSET_REG
1264 subl %eax, %OFFSET_REG
1265
1266 vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1267 vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1268 VPTESTM %xmm0, %xmm0, %k2
1269 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1270 kmovd %k1, %ecx
1271 # ifdef USE_AS_WCSCMP
1272 subl $0x3, %ecx
1273 # else
1274 incb %cl
1275 # endif
1276 jnz L(check_ret_vec_page_cross)
1277
1278
1279 # ifdef USE_AS_STRNCMP
1280 addl $(8 / SIZE_OF_CHAR), %OFFSET_REG
1281 subq %OFFSET_REG64, %rdx
1282 jbe L(ret_zero_page_cross_slow_case0)
1283 subq $-(CHAR_PER_VEC * 4), %rdx
1284
1285 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1286 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1287 # else
1288 leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1289 leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1290 # endif
1291 jmp L(prepare_loop_aligned)
1292
1293
1294
1295
1296 .p2align 4,, 10
1297 L(less_8_till_page):
1298 # ifdef USE_AS_WCSCMP
1299 /* If using wchar then this is the only check before we reach
1300 the page boundary. */
1301 movl (%rdi), %eax
1302 movl (%rsi), %ecx
1303 cmpl %ecx, %eax
1304 jnz L(ret_less_8_wcs)
1305 # ifdef USE_AS_STRNCMP
1306 addq $-(CHAR_PER_VEC * 2), %rdx
1307 /* We already checked for len <= 1 so cannot hit that case here.
1308 */
1309 # endif
1310 testl %eax, %eax
1311 jnz L(prepare_loop)
1312 ret
1313
1314 .p2align 4,, 8
1315 L(ret_less_8_wcs):
1316 setl %OFFSET_REG8
1317 negl %OFFSET_REG
1318 movl %OFFSET_REG, %eax
1319 xorl %r8d, %eax
1320 ret
1321
1322 # else
1323 cmpl $28, %eax
1324 ja L(less_4_till_page)
1325
1326 vmovd (%rdi), %xmm0
1327 vmovd (%rsi), %xmm1
1328 VPTESTM %xmm0, %xmm0, %k2
1329 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1330 kmovd %k1, %ecx
1331 subl $0xf, %ecx
1332 jnz L(check_ret_vec_page_cross)
1333
1334 # ifdef USE_AS_STRNCMP
1335 cmpq $4, %rdx
1336 jbe L(ret_zero_page_cross_slow_case1)
1337 # endif
1338 movl $(28 / SIZE_OF_CHAR), %OFFSET_REG
1339 subl %eax, %OFFSET_REG
1340
1341 vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
1342 vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
1343 VPTESTM %xmm0, %xmm0, %k2
1344 CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
1345 kmovd %k1, %ecx
1346 subl $0xf, %ecx
1347 jnz L(check_ret_vec_page_cross)
1348 # ifdef USE_AS_STRNCMP
1349 addl $(4 / SIZE_OF_CHAR), %OFFSET_REG
1350 subq %OFFSET_REG64, %rdx
1351 jbe L(ret_zero_page_cross_slow_case1)
1352 subq $-(CHAR_PER_VEC * 4), %rdx
1353
1354 leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1355 leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1356 # else
1357 leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
1358 leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
1359 # endif
1360 jmp L(prepare_loop_aligned)
1361
1362
1363 # ifdef USE_AS_STRNCMP
1364 .p2align 4,, 2
1365 L(ret_zero_page_cross_slow_case1):
1366 xorl %eax, %eax
1367 ret
1368 # endif
1369
1370 .p2align 4,, 10
1371 L(less_4_till_page):
1372 subq %rdi, %rsi
1373 /* Extremely slow byte comparison loop. */
1374 L(less_4_loop):
1375 movzbl (%rdi), %eax
1376 movzbl (%rsi, %rdi), %ecx
1377 TOLOWER_gpr (%rax, %eax)
1378 TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
1379 subl %BYTE_LOOP_REG, %eax
1380 jnz L(ret_less_4_loop)
1381 testl %ecx, %ecx
1382 jz L(ret_zero_4_loop)
1383 # ifdef USE_AS_STRNCMP
1384 decq %rdx
1385 jz L(ret_zero_4_loop)
1386 # endif
1387 incq %rdi
1388 /* End condition is reaching the page boundary (rdi is aligned).  */
1389 testl $31, %edi
1390 jnz L(less_4_loop)
1391 leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
1392 addq $-(VEC_SIZE * 4), %rdi
1393 # ifdef USE_AS_STRNCMP
1394 subq $-(CHAR_PER_VEC * 4), %rdx
1395 # endif
1396 jmp L(prepare_loop_aligned)
1397
1398 L(ret_zero_4_loop):
1399 xorl %eax, %eax
1400 ret
1401 L(ret_less_4_loop):
1402 xorl %r8d, %eax
1403 subl %r8d, %eax
1404 ret
1405 # endif
1406 cfi_endproc
1407 .size STRCMP, .-STRCMP
1408 #endif