initial commit
[glibc.git] / sysdeps / aarch64 / memchr.S
1 /* memchr - find a character in a memory zone
2
3 Copyright (C) 2015-2022 Free Software Foundation, Inc.
4
5 This file is part of the GNU C Library.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library. If not, see
19 <https://www.gnu.org/licenses/>. */
20
21 #include <sysdep.h>
22
23 /* Assumptions:
24 *
25 * ARMv8-a, AArch64, Advanced SIMD.
26 * MTE compatible.
27 */
28
/* Symbol name used for the entry point below; an including file may define MEMCHR first to override the default. */
29 #ifndef MEMCHR
30 # define MEMCHR __memchr
31 #endif
32
33 /* Arguments and results. */
34 #define srcin x0 /* start address of the buffer to search */
35 #define chrin w1 /* value to search for; replicated per byte lane by dup */
36 #define cntin x2 /* number of bytes to search */
37 #define result x0 /* return value; shares x0 with srcin */
38
/* General-purpose working registers. */
39 #define src x3 /* 16-byte aligned address of the chunk being examined */
40 #define cntrem x4 /* byte count still to be covered (recomputed at L(end)) */
41 #define synd x5 /* 64-bit match mask moved out of the vector unit */
42 #define shift x6 /* shift that drops first-chunk mask bits before srcin */
43 #define tmp x7 /* scratch */
44
/* Vector registers; q1/v1 and v3/d3 are different views of the same registers. */
45 #define vrepchr v0 /* search byte replicated into all 16 lanes */
46 #define qdata q1 /* current chunk, 128-bit view used by ldr */
47 #define vdata v1 /* current chunk, lane view used by ld1 and cmeq */
48 #define vhas_chr v2 /* cmeq result: 0xff in lanes that match, else 0 */
49 #define vend v3 /* match information reduced to the low 64 bits */
50 #define dend d3 /* low 64 bits of vend, the part copied to synd */
51
52 /*
53 Core algorithm:
54 For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
55 per byte. We take 4 bits of every comparison byte with a shift-right-and-narrow
56 by 4 (shrn) instruction. Since the bits in the nibble mask reflect the order in
57 which things occur in the original string, counting trailing zeros (rbit + clz;
58 rbit is skipped under __AARCH64EB__ at L(end)) identifies which byte matched. */
59
/* memchr: search the first cntin bytes at srcin for the byte held in chrin. */
/* Returns in result (x0) the address of the first matching byte, or 0 when */
/* cntin is zero or no byte inside the range matches. */
/* Data is read in 16-byte chunks starting at srcin rounded down to a 16-byte */
/* boundary; matches outside [srcin, srcin + cntin) are discarded. */
60 ENTRY (MEMCHR)
/* PTR_ARG and SIZE_ARG come from sysdep.h, which is not visible here. */
/* NOTE(review): presumably they normalise the pointer/size arguments - confirm. */
61 PTR_ARG (0)
62 SIZE_ARG (2)
/* src = srcin rounded down to a 16-byte boundary. */
63 bic src, srcin, 15
/* Zero-length search: return 0 without touching memory. */
64 cbz cntin, L(nomatch)
/* Load the aligned chunk that contains srcin and compare all 16 bytes at once. */
65 ld1 {vdata.16b}, [src]
66 dup vrepchr.16b, chrin
67 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
/* shift = srcin * 4; a register lsr only uses the low 6 bits of the amount, */
/* so this acts as 4 * (srcin & 15): four mask bits per byte before srcin. */
68 lsl shift, srcin, 2
69 shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
70 fmov synd, dend
/* Drop the mask nibbles that belong to bytes located before srcin. */
71 lsr synd, synd, shift
/* No match in the first chunk: continue with the following chunks. */
72 cbz synd, L(start_loop)
73
/* rbit + clz counts trailing zeros: 4 * (offset of first match from srcin). */
74 rbit synd, synd
75 clz synd, synd
76 add result, srcin, synd, lsr 2
/* Keep the pointer only if the match offset is below cntin (unsigned), else 0. */
77 cmp cntin, synd, lsr 2
78 csel result, result, xzr, hi
79 ret
80
81 L(start_loop):
/* tmp = 16 - (srcin - src): bytes of the range covered by the first chunk. */
82 sub tmp, src, srcin
83 add tmp, tmp, 16
/* cntrem = bytes left after the first chunk; none left (cntin <= tmp) means no match. */
84 subs cntrem, cntin, tmp
85 b.ls L(nomatch)
86
87 /* Make sure that it won't overread by a 16-byte chunk */
/* Bit 4 of cntrem + 15 is the parity of the number of 16-byte chunks left. */
/* The loop checks the count once per two chunks, so with an odd number of */
/* chunks it is entered at its second half. */
88 add tmp, cntrem, 15
89 tbnz tmp, 4, L(loop32_2)
90
/* Align the loop head to a 16-byte boundary. */
91 .p2align 4
92 L(loop32):
/* Pre-indexed load: src advances by 16, then the chunk at the new src is read. */
93 ldr qdata, [src, 16]!
94 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
/* Pairwise max folds the compare result so dend is non-zero iff any byte matched. */
95 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
96 fmov synd, dend
97 cbnz synd, L(end)
98
99 L(loop32_2):
100 ldr qdata, [src, 16]!
/* Account for both chunks of this iteration; flags are consumed by b.ls below */
/* (the intervening cmeq does not alter them). */
101 subs cntrem, cntrem, 32
102 cmeq vhas_chr.16b, vdata.16b, vrepchr.16b
/* Count exhausted: this was the last chunk, finish whether or not it matched. */
103 b.ls L(end)
104 umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
105 fmov synd, dend
/* No match yet: next iteration; otherwise fall through to L(end). */
106 cbz synd, L(loop32)
107 L(end):
/* Build the 4-bits-per-byte mask for the chunk loaded last (still in vhas_chr). */
108 shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
109 fmov synd, dend
/* cntrem = (srcin + cntin) - src: bytes of the range that start at this chunk. */
110 add tmp, srcin, cntin
111 sub cntrem, tmp, src
/* Bit reversal is applied only when __AARCH64EB__ is not defined. */
/* NOTE(review): presumably because ldr q lane order differs on big-endian - confirm. */
112 #ifndef __AARCH64EB__
113 rbit synd, synd
114 #endif
/* synd = 4 * (offset of first match in this chunk); an all-zero mask gives 64, */
/* i.e. offset 16, which fails the bound check below on the count-exhausted path. */
115 clz synd, synd
116 cmp cntrem, synd, lsr 2
117 add result, src, synd, lsr 2
/* Return the match only if its offset is below cntrem (unsigned), else 0. */
118 csel result, result, xzr, hi
119 ret
120
121 L(nomatch):
/* Not found, or zero length: return 0. */
122 mov result, 0
123 ret
124
125 END (MEMCHR)
/* Alias macros are defined outside this file. */
/* NOTE(review): presumably these export MEMCHR as memchr and add the hidden internal alias - confirm. */
126 weak_alias (MEMCHR, memchr)
127 libc_hidden_builtin_def (memchr)