1 /* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
2 Copyright (C) 2017-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
19 #include <isa-level.h>
21 #if ISA_SHOULD_BUILD (3)
26 # define STRLEN __strlen_avx2
30 # define VPCMPEQ vpcmpeqd
31 # define VPMINU vpminud
34 # define VPCMPEQ vpcmpeqb
35 # define VPMINU vpminub
40 # define VZEROUPPER vzeroupper
44 # define SECTION(p) p##.avx
48 # define PAGE_SIZE 4096
49 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
51 .section SECTION(.text),"ax",@progbits
53 # ifdef USE_AS_STRNLEN
54 /* Check zero length. */
56 /* Clear upper bits. */
62 /* Store max len in R8_LP before adjusting if using WCSLEN. */
67 vpxor %xmm0, %xmm0, %xmm0
68 /* Clear high bits from edi. Only keeping bits relevant to page
70 andl $(PAGE_SIZE - 1), %eax
71 /* Check if we may cross page boundary with one vector load. */
72 cmpl $(PAGE_SIZE - VEC_SIZE), %eax
73 ja L(cross_page_boundary)
75 /* Check the first VEC_SIZE bytes. */
76 VPCMPEQ (%rdi), %ymm0, %ymm1
78 # ifdef USE_AS_STRNLEN
79 /* If length < VEC_SIZE handle special. */
80 cmpq $CHAR_PER_VEC, %rsi
83 /* If empty continue to aligned_more. Otherwise return bit
84 position of first match. */
89 /* NB: Divide bytes by 4 to get wchar_t count. */
94 # ifdef USE_AS_STRNLEN
101 /* Set bit for max len so that tzcnt will return min of max len
102 and position of first match. */
103 # ifdef USE_AS_WCSLEN
104 /* NB: Multiply length by 4 to get byte count. */
109 # ifdef USE_AS_WCSLEN
110 /* NB: Divide bytes by 4 to get wchar_t count. */
119 /* Safe to use 32 bit instructions as these are only called for
121 # ifdef USE_AS_STRNLEN
122 /* Use ecx which was computed earlier to compute correct value.
124 # ifdef USE_AS_WCSLEN
125 leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
127 subl $(VEC_SIZE * 4 + 1), %ecx
135 # ifdef USE_AS_WCSLEN
136 /* NB: Divide bytes by 4 to get wchar_t count. */
144 /* Safe to use 32 bit instructions as these are only called for
146 # ifdef USE_AS_STRNLEN
147 /* Use ecx which was computed earlier to compute correct value.
149 # ifdef USE_AS_WCSLEN
150 leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
152 subl $(VEC_SIZE * 3 + 1), %ecx
157 addl $(VEC_SIZE + 1), %edi
160 # ifdef USE_AS_WCSLEN
161 /* NB: Divide bytes by 4 to get wchar_t count. */
169 /* Safe to use 32 bit instructions as these are only called for
171 # ifdef USE_AS_STRNLEN
172 /* Use ecx which was computed earlier to compute correct value.
174 # ifdef USE_AS_WCSLEN
175 leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
177 subl $(VEC_SIZE * 2 + 1), %ecx
182 addl $(VEC_SIZE * 2 + 1), %edi
185 # ifdef USE_AS_WCSLEN
186 /* NB: Divide bytes by 4 to get wchar_t count. */
194 /* Safe to use 32 bit instructions as these are only called for
196 # ifdef USE_AS_STRNLEN
197 /* Use ecx which was computed earlier to compute correct value.
199 # ifdef USE_AS_WCSLEN
200 leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
202 subl $(VEC_SIZE + 1), %ecx
207 addl $(VEC_SIZE * 3 + 1), %edi
210 # ifdef USE_AS_WCSLEN
211 /* NB: Divide bytes by 4 to get wchar_t count. */
218 /* Align data to VEC_SIZE - 1. This is the same number of
219 instructions as using andq with -VEC_SIZE but saves 4 bytes of
220 code on the x4 check. */
221 orq $(VEC_SIZE - 1), %rdi
222 L(cross_page_continue):
223 /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
224 since data is only aligned to VEC_SIZE. */
225 # ifdef USE_AS_STRNLEN
226 /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
227 because it simplifies the logic in last_4x_vec_or_less. */
228 leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
230 # ifdef USE_AS_WCSLEN
231 /* NB: Divide bytes by 4 to get the wchar_t count. */
235 /* Load first VEC regardless. */
236 VPCMPEQ 1(%rdi), %ymm0, %ymm1
237 # ifdef USE_AS_STRNLEN
238 /* Adjust length. If near end handle specially. */
240 jb L(last_4x_vec_or_less)
242 vpmovmskb %ymm1, %eax
246 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
247 vpmovmskb %ymm1, %eax
251 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
252 vpmovmskb %ymm1, %eax
256 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
257 vpmovmskb %ymm1, %eax
261 /* Align data to VEC_SIZE * 4 - 1. */
262 # ifdef USE_AS_STRNLEN
263 /* Before adjusting length check if at last VEC_SIZE * 4. */
264 cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
265 jbe L(last_4x_vec_or_less_load)
268 orq $(VEC_SIZE * 4 - 1), %rdi
269 andl $(VEC_SIZE * 4 - 1), %ecx
270 # ifdef USE_AS_WCSLEN
271 /* NB: Divide bytes by 4 to get the wchar_t count. */
274 /* Readjust length. */
278 orq $(VEC_SIZE * 4 - 1), %rdi
280 /* Compare 4 * VEC at a time forward. */
283 # ifdef USE_AS_STRNLEN
284 /* Break if at end of length. */
285 subq $(CHAR_PER_VEC * 4), %rsi
286 jb L(last_4x_vec_or_less_cmpeq)
288 /* Save some code size by microfusing VPMINU with the load.
289 Since the matches in ymm2/ymm4 can only be returned if there
290 were no matches in ymm1/ymm3 respectively there is no issue
292 vmovdqa 1(%rdi), %ymm1
293 VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
294 vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
295 VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
297 VPMINU %ymm2, %ymm4, %ymm5
298 VPCMPEQ %ymm5, %ymm0, %ymm5
299 vpmovmskb %ymm5, %ecx
301 subq $-(VEC_SIZE * 4), %rdi
306 VPCMPEQ %ymm1, %ymm0, %ymm1
307 vpmovmskb %ymm1, %eax
310 jnz L(last_vec_return_x0)
312 VPCMPEQ %ymm2, %ymm0, %ymm2
313 vpmovmskb %ymm2, %eax
315 jnz L(last_vec_return_x1)
317 /* Combine last 2 VEC. */
318 VPCMPEQ %ymm3, %ymm0, %ymm3
319 vpmovmskb %ymm3, %eax
320 /* rcx has combined result from all 4 VEC. It will only be used
321 if the first 3 other VEC all did not contain a match. */
325 subq $(VEC_SIZE * 2 - 1), %rdi
327 # ifdef USE_AS_WCSLEN
328 /* NB: Divide bytes by 4 to get wchar_t count. */
334 # ifdef USE_AS_STRNLEN
336 L(last_4x_vec_or_less_load):
337 /* Depending on entry adjust rdi / prepare first VEC in ymm1.
339 subq $-(VEC_SIZE * 4), %rdi
340 L(last_4x_vec_or_less_cmpeq):
341 VPCMPEQ 1(%rdi), %ymm0, %ymm1
342 L(last_4x_vec_or_less):
343 # ifdef USE_AS_WCSLEN
344 /* NB: Multiply length by 4 to get byte count. */
347 vpmovmskb %ymm1, %eax
348 /* If remaining length > VEC_SIZE * 2. This works if esi is off
350 testl $(VEC_SIZE * 2), %esi
353 /* length may have been negative or positive by an offset of
354 VEC_SIZE * 4 depending on where this was called from. This fixes
356 andl $(VEC_SIZE * 4 - 1), %esi
358 jnz L(last_vec_x1_check)
363 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
364 vpmovmskb %ymm1, %eax
366 /* Check the end of data. */
370 addl $(VEC_SIZE + 1), %eax
372 # ifdef USE_AS_WCSLEN
373 /* NB: Divide bytes by 4 to get wchar_t count. */
380 L(last_vec_return_x0):
382 subq $(VEC_SIZE * 4 - 1), %rdi
384 # ifdef USE_AS_WCSLEN
385 /* NB: Divide bytes by 4 to get wchar_t count. */
391 L(last_vec_return_x1):
393 subq $(VEC_SIZE * 3 - 1), %rdi
395 # ifdef USE_AS_WCSLEN
396 /* NB: Divide bytes by 4 to get wchar_t count. */
401 # ifdef USE_AS_STRNLEN
403 L(last_vec_x1_check):
406 /* Check the end of data. */
412 # ifdef USE_AS_WCSLEN
413 /* NB: Divide bytes by 4 to get wchar_t count. */
424 /* Test first 2x VEC normally. */
428 VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
429 vpmovmskb %ymm1, %eax
433 /* Normalize length. */
434 andl $(VEC_SIZE * 4 - 1), %esi
435 VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
436 vpmovmskb %ymm1, %eax
440 subl $(VEC_SIZE * 3), %esi
443 VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
444 vpmovmskb %ymm1, %eax
446 /* Check the end of data. */
450 addl $(VEC_SIZE * 3 + 1), %eax
452 # ifdef USE_AS_WCSLEN
453 /* NB: Divide bytes by 4 to get wchar_t count. */
461 /* essentially duplicates of first_vec_x1 but use 64 bit
467 # ifdef USE_AS_WCSLEN
468 /* NB: Divide bytes by 4 to get wchar_t count. */
475 /* essentially duplicates of first_vec_x1 but use 64 bit
479 addl $(VEC_SIZE + 1), %eax
481 # ifdef USE_AS_WCSLEN
482 /* NB: Divide bytes by 4 to get wchar_t count. */
490 subl $(VEC_SIZE * 2), %esi
491 /* Check the end of data. */
495 addl $(VEC_SIZE * 2 + 1), %eax
497 # ifdef USE_AS_WCSLEN
498 /* NB: Divide bytes by 4 to get wchar_t count. */
507 /* Cold case for crossing page with first load. */
509 L(cross_page_boundary):
510 /* Align data to VEC_SIZE - 1. */
511 orq $(VEC_SIZE - 1), %rdi
512 VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
513 vpmovmskb %ymm1, %eax
514 /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
515 so no need to manually mod rdx. */
516 sarxl %edx, %eax, %eax
517 # ifdef USE_AS_STRNLEN
519 jnz L(cross_page_less_vec)
522 # ifdef USE_AS_WCSLEN
523 /* NB: Divide bytes by 4 to get wchar_t count. */
528 jb L(cross_page_continue)
532 jz L(cross_page_continue)
534 # ifdef USE_AS_WCSLEN
535 /* NB: Divide length by 4 to get wchar_t count. */
539 L(return_vzeroupper):
540 ZERO_UPPER_VEC_REGISTERS_RETURN
542 # ifdef USE_AS_STRNLEN
544 L(cross_page_less_vec):
546 # ifdef USE_AS_WCSLEN
547 /* NB: Multiply length by 4 to get byte count. */
552 # ifdef USE_AS_WCSLEN