/* Optimized memchr with sse2
   Copyright (C) 2011-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# define CFI_PUSH(REG) \
	cfi_adjust_cfa_offset (4); \
	cfi_rel_offset (REG, 0)

# define CFI_POP(REG) \
	cfi_adjust_cfa_offset (-4); \
	cfi_restore (REG)

# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
# define POP(REG)	popl REG; CFI_POP (REG)

# define PARMS	4
# define STR1	PARMS
# define STR2	STR1+4

# ifndef USE_AS_RAWMEMCHR
#  define LEN	STR2+4
#  define RETURN	POP(%edi); ret; CFI_PUSH(%edi);
# endif

# ifndef MEMCHR
#  define MEMCHR	__memchr_sse2_bsf
# endif

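/* void *memchr (const void *s, int c, size_t n)
   Returns a pointer to the first occurrence of the byte C in the
   first N bytes of S, or NULL if there is none.  When built with
   USE_AS_RAWMEMCHR defined, the length argument and all length
   checks are compiled out for the rawmemchr variant.  */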
	.text
ENTRY (MEMCHR)

	mov	STR1(%esp), %ecx
	movd	STR2(%esp), %xmm1

# ifndef USE_AS_RAWMEMCHR
	mov	LEN(%esp), %edx
	test	%edx, %edx
	jz	L(return_null_1)
# endif
	mov	%ecx, %eax

	punpcklbw %xmm1, %xmm1
	punpcklbw %xmm1, %xmm1

	and	$63, %ecx
	pshufd	$0, %xmm1, %xmm1

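	/* ECX is now the offset of S within a 64-byte cache line.  If an
	   unaligned 16-byte load at S would cross into the next line,
	   take the crosscache path instead.  */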
	cmp	$48, %ecx
	ja	L(crosscache)

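	/* Check the first 16 bytes with a single unaligned load.  */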
	movdqu	(%eax), %xmm0
	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %ecx
	test	%ecx, %ecx
	je	L(unaligned_no_match_1)
	/* Check which byte is a match.  */
	bsf	%ecx, %ecx

# ifndef USE_AS_RAWMEMCHR
	sub	%ecx, %edx
	jbe	L(return_null_1)
# endif
	add	%ecx, %eax
	ret

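	/* No match in the first 16 bytes: round the pointer up to the next
	   16-byte boundary and jump to the loop prolog.  For memchr, EDI
	   becomes the running pointer and EDX is adjusted so it counts the
	   bytes remaining from that aligned pointer.  */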
	.p2align 4
L(unaligned_no_match_1):
# ifndef USE_AS_RAWMEMCHR
	sub	$16, %edx
	jbe	L(return_null_1)
	PUSH	(%edi)
	lea	16(%eax), %edi
	and	$15, %eax
	and	$-16, %edi
	add	%eax, %edx
# else
	lea	16(%eax), %edx
	and	$-16, %edx
# endif
	jmp	L(loop_prolog)

	.p2align 4
L(return_null_1):
	xor	%eax, %eax
	ret

# ifndef USE_AS_RAWMEMCHR
	CFI_POP	(%edi)
# endif

	.p2align 4
L(crosscache):
	/* Handle unaligned string.  */

# ifndef USE_AS_RAWMEMCHR
	PUSH	(%edi)
	mov	%eax, %edi
	and	$15, %ecx
	and	$-16, %edi
	movdqa	(%edi), %xmm0
# else
	mov	%eax, %edx
	and	$15, %ecx
	and	$-16, %edx
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
	/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
	/* Check which byte is a match.  */
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	sub	%eax, %edx
	jbe	L(return_null)
	add	%edi, %eax
	add	%ecx, %eax
	RETURN
# else
	add	%edx, %eax
	add	%ecx, %eax
	ret
# endif

	.p2align 4
L(unaligned_no_match):
# ifndef USE_AS_RAWMEMCHR
	/* Calculate the last acceptable address and check for possible
	   addition overflow by using saturated math:
	   edx = ecx + edx
	   edx |= -(edx < ecx) */
	add	%ecx, %edx
	sbb	%eax, %eax
	or	%eax, %edx
	sub	$16, %edx
	jbe	L(return_null)
	add	$16, %edi
# else
	add	$16, %edx
# endif

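	/* Scan up to two 64-byte blocks 16 bytes at a time until the
	   pointer is 64-byte aligned, then enter the unrolled loop.  */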
	.p2align 4
/* Loop start on aligned string.  */
L(loop_prolog):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm4
# else
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm4

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

# ifndef USE_AS_RAWMEMCHR
	test	$0x3f, %edi
# else
	test	$0x3f, %edx
# endif
	jz	L(align64_loop)

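	/* The pointer is not yet 64-byte aligned: check one more 64-byte
	   block, then round the pointer down to a 64-byte boundary before
	   entering the unrolled loop.  */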
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
# else
	movdqa	(%edx), %xmm0
# endif
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

# ifndef USE_AS_RAWMEMCHR
	movdqa	16(%edi), %xmm2
# else
	movdqa	16(%edx), %xmm2
# endif
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

# ifndef USE_AS_RAWMEMCHR
	movdqa	48(%edi), %xmm3
# else
	movdqa	48(%edx), %xmm3
# endif
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif
	test	%eax, %eax
	jnz	L(matches0)

# ifndef USE_AS_RAWMEMCHR
	mov	%edi, %ecx
	and	$-64, %edi
	and	$63, %ecx
	add	%ecx, %edx
# else
	and	$-64, %edx
# endif

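	/* Main loop: process 64 bytes per iteration from a 64-byte aligned
	   pointer.  pcmpeqb leaves 0xff in every matching byte, so merging
	   the four results with pmaxub yields a nonzero vector exactly when
	   one of the 64 bytes matched, and a single pmovmskb/test decides
	   whether to keep looping.  */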
	.p2align 4
L(align64_loop):
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edx
	jbe	L(exit_loop)
	movdqa	(%edi), %xmm0
	movdqa	16(%edi), %xmm2
	movdqa	32(%edi), %xmm3
	movdqa	48(%edi), %xmm4
# else
	movdqa	(%edx), %xmm0
	movdqa	16(%edx), %xmm2
	movdqa	32(%edx), %xmm3
	movdqa	48(%edx), %xmm4
# endif
	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

# ifndef USE_AS_RAWMEMCHR
	add	$64, %edi
# else
	add	$64, %edx
# endif

	test	%eax, %eax
	jz	L(align64_loop)

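	/* One of the last four 16-byte blocks contained a match: rewind
	   the pointer and test each block in turn to locate it.  */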
# ifndef USE_AS_RAWMEMCHR
	sub	$64, %edi
# else
	sub	$64, %edx
# endif

	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

# ifndef USE_AS_RAWMEMCHR
	movdqa	32(%edi), %xmm3
# else
	movdqa	32(%edx), %xmm3
# endif

	pcmpeqb	%xmm1, %xmm3

# ifndef USE_AS_RAWMEMCHR
	pcmpeqb	48(%edi), %xmm1
# else
	pcmpeqb	48(%edx), %xmm1
# endif
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	pmovmskb %xmm1, %eax
	bsf	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	lea	48(%edi, %eax), %eax
	RETURN
# else
	lea	48(%edx, %eax), %eax
	ret
# endif

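	/* memchr only: at most 64 bytes remain.  Adding 64 back to EDX
	   restores the exact remaining count; the tail is scanned 16 bytes
	   at a time, and any match that could lie beyond the requested
	   length is bounds-checked via the *_1 labels.  */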
# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(exit_loop):
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%edi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%edi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	48(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	xor	%eax, %eax
	RETURN

	.p2align 4
L(exit_loop_32):
	movdqa	(%edi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	16(%edi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	xor	%eax, %eax
	RETURN
# endif
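	/* The matches* labels turn the bit index from bsf into the final
	   pointer.  L(matches0) is reached after the pointer has already
	   been advanced past the 64-byte block, hence the -16 adjustment;
	   the *_1 variants also verify that the match lies within the
	   requested length.  */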
	.p2align 4
L(matches0):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	-16(%eax, %edi), %eax
	RETURN
# else
	lea	-16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
L(matches):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	add	%edi, %eax
	RETURN
# else
	add	%edx, %eax
	ret
# endif

	.p2align 4
L(matches16):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	16(%eax, %edi), %eax
	RETURN
# else
	lea	16(%eax, %edx), %eax
	ret
# endif

	.p2align 4
L(matches32):
	bsf	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	lea	32(%eax, %edi), %eax
	RETURN
# else
	lea	32(%eax, %edx), %eax
	ret
# endif

# ifndef USE_AS_RAWMEMCHR
	.p2align 4
L(matches_1):
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	add	%edi, %eax
	RETURN

	.p2align 4
L(matches16_1):
	sub	$16, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	16(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches32_1):
	sub	$32, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	32(%edi, %eax), %eax
	RETURN

	.p2align 4
L(matches48_1):
	sub	$48, %edx
	bsf	%eax, %eax
	sub	%eax, %edx
	jbe	L(return_null)

	lea	48(%edi, %eax), %eax
	RETURN
# endif
	.p2align 4
L(return_null):
	xor	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	RETURN
# else
	ret
# endif

END (MEMCHR)
#endif