/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>. */

#include <isa-level.h>

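/* Only assemble this version when the ISA level configuration allows an
   AVX2/BMI2 (ISA level 3, i.e. x86-64-v3) implementation to be used.  */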
#if ISA_SHOULD_BUILD (3)

# include <sysdep.h>

# ifndef STRLEN
# define STRLEN __strlen_avx2
# endif

# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
# define CHAR_SIZE 1
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

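/* Algorithm: %rdi is the string pointer and %rdx keeps a copy of the
   original pointer so the result can be computed as match address minus
   start; %ymm0 stays zeroed for the null-character compares.  The first
   vector is checked with an unaligned load (with a cold path for loads
   that would cross a page), the next four vectors are checked one at a
   time after aligning, and the main loop then scans 4 * VEC_SIZE bytes
   per iteration. */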
	.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check zero length. */
# ifdef __ILP32__
	/* Clear upper bits. */
	and %RSI_LP, %RSI_LP
# else
	test %RSI_LP, %RSI_LP
# endif
	jz L(zero)
	/* Store max len in R8_LP before adjusting if using WCSLEN. */
	mov %RSI_LP, %R8_LP
# endif
	movl %edi, %eax
	movq %rdi, %rdx
	vpxor %xmm0, %xmm0, %xmm0
	/* Clear high bits from edi. Only keeping bits relevant to page
	   cross check. */
	andl $(PAGE_SIZE - 1), %eax
	/* Check if we may cross page boundary with one vector load. */
	cmpl $(PAGE_SIZE - VEC_SIZE), %eax
	ja L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes. */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
	/* If length < VEC_SIZE handle special. */
	cmpq $CHAR_PER_VEC, %rsi
	jbe L(first_vec_x0)
# endif
	/* If no bits are set (no null in the first VEC) continue to
	   aligned_more. Otherwise return the bit position of the first
	   match. */
	testl %eax, %eax
	jz L(aligned_more)
	tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrl $2, %eax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
L(zero):
	xorl %eax, %eax
	ret

	.p2align 4
L(first_vec_x0):
	/* Set bit for max len so that tzcnt will return min of max len
	   and position of first match. */
# ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count. */
	sall $2, %esi
# endif
	btsq %rsi, %rax
	tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrl $2, %eax
# endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(first_vec_x1):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
# ifdef USE_AS_WCSLEN
	leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
# else
	subl $(VEC_SIZE * 4 + 1), %ecx
	addl %ecx, %eax
# endif
# else
	subl %edx, %edi
	incl %edi
	addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrl $2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x2):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
# ifdef USE_AS_WCSLEN
	leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
# else
	subl $(VEC_SIZE * 3 + 1), %ecx
	addl %ecx, %eax
# endif
# else
	subl %edx, %edi
	addl $(VEC_SIZE + 1), %edi
	addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrl $2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x3):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
# ifdef USE_AS_WCSLEN
	leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
# else
	subl $(VEC_SIZE * 2 + 1), %ecx
	addl %ecx, %eax
# endif
# else
	subl %edx, %edi
	addl $(VEC_SIZE * 2 + 1), %edi
	addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrl $2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(first_vec_x4):
	tzcntl %eax, %eax
	/* Safe to use 32 bit instructions as these are only called for
	   size = [1, 159]. */
# ifdef USE_AS_STRNLEN
	/* Use ecx which was computed earlier to compute correct value.
	 */
# ifdef USE_AS_WCSLEN
	leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
# else
	subl $(VEC_SIZE + 1), %ecx
	addl %ecx, %eax
# endif
# else
	subl %edx, %edi
	addl $(VEC_SIZE * 3 + 1), %edi
	addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrl $2, %eax
# endif
	VZEROUPPER_RETURN

	.p2align 5
L(aligned_more):
	/* Align data to VEC_SIZE - 1. This is the same number of
	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
	   code on the x4 check. */
	orq $(VEC_SIZE - 1), %rdi
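	/* rdi now points to the last byte of its VEC_SIZE block, so the
	   next aligned block starts at 1(%rdi) and every load below uses
	   a +1 displacement. */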
L(cross_page_continue):
	/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
	   because it simplifies the logic in last_4x_vec_or_less. */
	leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
	subq %rdx, %rcx
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %ecx
# endif
# endif
	/* Load first VEC regardless. */
	VPCMPEQ 1(%rdi), %ymm0, %ymm1
# ifdef USE_AS_STRNLEN
	/* Adjust length. If near end handle specially. */
	subq %rcx, %rsi
	jb L(last_4x_vec_or_less)
# endif
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x1)

	VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x2)

	VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x3)

	VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(first_vec_x4)

	/* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
	/* Before adjusting length check if at last VEC_SIZE * 4. */
	cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
	jbe L(last_4x_vec_or_less_load)
	incq %rdi
	movl %edi, %ecx
	orq $(VEC_SIZE * 4 - 1), %rdi
	andl $(VEC_SIZE * 4 - 1), %ecx
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get the wchar_t count. */
	sarl $2, %ecx
# endif
	/* Readjust length. */
	addq %rcx, %rsi
# else
	incq %rdi
	orq $(VEC_SIZE * 4 - 1), %rdi
# endif
	/* Compare 4 * VEC at a time forward. */
	.p2align 4
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
	/* Break if at end of length. */
	subq $(CHAR_PER_VEC * 4), %rsi
	jb L(last_4x_vec_or_less_cmpeq)
# endif
	/* Save some code size by microfusing VPMINU with the load.
	   Since the matches in ymm2/ymm4 can only be returned if there
	   were no matches in ymm1/ymm3 respectively, there is no issue
	   with overlap. */
	vmovdqa 1(%rdi), %ymm1
	VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
	vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
	VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4

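	/* ymm5 is the character-wise minimum of all four vectors, so a
	   null character anywhere in this 4 * VEC_SIZE block shows up as
	   a zero in ymm5 and one compare plus one movmsk covers the
	   whole block. */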
	VPMINU %ymm2, %ymm4, %ymm5
	VPCMPEQ %ymm5, %ymm0, %ymm5
	vpmovmskb %ymm5, %ecx

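	/* subq with the negative constant keeps the immediate in a
	   sign-extended imm8; addq $(VEC_SIZE * 4) would need an imm32. */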
	subq $-(VEC_SIZE * 4), %rdi
	testl %ecx, %ecx
	jz L(loop_4x_vec)


	VPCMPEQ %ymm1, %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	subq %rdx, %rdi
	testl %eax, %eax
	jnz L(last_vec_return_x0)

	VPCMPEQ %ymm2, %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl %eax, %eax
	jnz L(last_vec_return_x1)

	/* Combine last 2 VEC. */
	VPCMPEQ %ymm3, %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	/* rcx has combined result from all 4 VEC. It will only be used
	   if the first 3 other VEC all did not contain a match. */
	salq $32, %rcx
	orq %rcx, %rax
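	/* With the VEC 3 mask in the low 32 bits and the combined mask
	   in the high 32 bits, a single 64 bit tzcnt finds a match in
	   VEC 3 first and otherwise the match in VEC 4; since VEC_SIZE
	   is 32, the +32 from the shift is exactly the distance from
	   VEC 3 to VEC 4. */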
	tzcntq %rax, %rax
	subq $(VEC_SIZE * 2 - 1), %rdi
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN


# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_4x_vec_or_less_load):
	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
	 */
	subq $-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
	VPCMPEQ 1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
# ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count. */
	sall $2, %esi
# endif
	vpmovmskb %ymm1, %eax
	/* If remaining length > VEC_SIZE * 2, handle it in last_4x_vec.
	   This works even if esi is off by VEC_SIZE * 4. */
	testl $(VEC_SIZE * 2), %esi
	jnz L(last_4x_vec)

	/* The length may be off by VEC_SIZE * 4 in either direction
	   depending on where this was called from. This fixes that. */
	andl $(VEC_SIZE * 4 - 1), %esi
	testl %eax, %eax
	jnz L(last_vec_x1_check)

	subl $VEC_SIZE, %esi
	jb L(max)

	VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max)
	subq %rdx, %rdi
	addl $(VEC_SIZE + 1), %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN
# endif

	.p2align 4
L(last_vec_return_x0):
	tzcntl %eax, %eax
	subq $(VEC_SIZE * 4 - 1), %rdi
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_return_x1):
	tzcntl %eax, %eax
	subq $(VEC_SIZE * 3 - 1), %rdi
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(last_vec_x1_check):

	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max)
	subq %rdx, %rdi
	incl %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN

L(max):
	movq %r8, %rax
	VZEROUPPER_RETURN

	.p2align 4
L(last_4x_vec):
	/* Test first 2x VEC normally. */
	testl %eax, %eax
	jnz L(last_vec_x1)

	VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(last_vec_x2)

	/* Normalize length. */
	andl $(VEC_SIZE * 4 - 1), %esi
	VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl %eax, %eax
	jnz L(last_vec_x3)

	subl $(VEC_SIZE * 3), %esi
	jb L(max)

	VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	tzcntl %eax, %eax
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max)
	subq %rdx, %rdi
	addl $(VEC_SIZE * 3 + 1), %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN


	.p2align 4
L(last_vec_x1):
	/* Essentially a duplicate of first_vec_x1, but using 64 bit
	   instructions. */
	tzcntl %eax, %eax
	subq %rdx, %rdi
	incl %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x2):
	/* Essentially a duplicate of first_vec_x2, but using 64 bit
	   instructions. */
	tzcntl %eax, %eax
	subq %rdx, %rdi
	addl $(VEC_SIZE + 1), %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN

	.p2align 4
L(last_vec_x3):
	tzcntl %eax, %eax
	subl $(VEC_SIZE * 2), %esi
	/* Check the end of data. */
	cmpl %eax, %esi
	jb L(max_end)
	subq %rdx, %rdi
	addl $(VEC_SIZE * 2 + 1), %eax
	addq %rdi, %rax
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrq $2, %rax
# endif
	VZEROUPPER_RETURN
L(max_end):
	movq %r8, %rax
	VZEROUPPER_RETURN
# endif

	/* Cold case for crossing page with first load. */
	.p2align 4
L(cross_page_boundary):
	/* Align data to VEC_SIZE - 1. */
	orq $(VEC_SIZE - 1), %rdi
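	/* The load below starts at the VEC_SIZE aligned address at or
	   before the string start, so it cannot cross into the next
	   page. */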
	VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
	   so no need to manually mod rdx. */
	sarxl %edx, %eax, %eax
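	/* After the shift, bit 0 of eax corresponds to the first byte of
	   the string, so tzcnt on the mask yields the offset of the
	   first null from the start of the string. */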
# ifdef USE_AS_STRNLEN
	testl %eax, %eax
	jnz L(cross_page_less_vec)
	leaq 1(%rdi), %rcx
	subq %rdx, %rcx
# ifdef USE_AS_WCSLEN
	/* NB: Divide bytes by 4 to get wchar_t count. */
	shrl $2, %ecx
# endif
	/* Check length. */
	cmpq %rsi, %rcx
	jb L(cross_page_continue)
	movq %r8, %rax
# else
	testl %eax, %eax
	jz L(cross_page_continue)
	tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Divide length by 4 to get wchar_t count. */
	shrl $2, %eax
# endif
# endif
L(return_vzeroupper):
	ZERO_UPPER_VEC_REGISTERS_RETURN

# ifdef USE_AS_STRNLEN
	.p2align 4
L(cross_page_less_vec):
	tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
	/* NB: Multiply length by 4 to get byte count. */
	sall $2, %esi
# endif
	cmpq %rax, %rsi
	cmovb %esi, %eax
# ifdef USE_AS_WCSLEN
	shrl $2, %eax
# endif
	VZEROUPPER_RETURN
# endif

END (STRLEN)
#endif