/* Source: glibc.git (initial commit snapshot)
   sysdeps/powerpc/powerpc64/le/power10/strlen.S  */
1 /* Optimized strlen implementation for POWER10 LE.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* To reuse the code for rawmemchr, we have some extra steps compared to the
22 strlen implementation:
23 - Sum the initial value of r3 with the position at which the char was
24 found, to guarantee we return a pointer and not the length.
25 - In the main loop, subtract each byte by the char we are looking for,
26 so we can keep using vminub to quickly check 64B at once. */
27 #ifdef USE_AS_RAWMEMCHR
28 # ifndef RAWMEMCHR
29 # define FUNCNAME __rawmemchr
30 # else
31 # define FUNCNAME RAWMEMCHR
32 # endif
33 # define MCOUNT_NARGS 2
34 # define VREG_ZERO v20
35 # define OFF_START_LOOP 256
36 # define RAWMEMCHR_SUBTRACT_VECTORS \
37 vsububm v4,v4,v18; \
38 vsububm v5,v5,v18; \
39 vsububm v6,v6,v18; \
40 vsububm v7,v7,v18;
/* TAIL(vreg,increment): vreg holds the vcmpequb result for the 16B chunk at
   r5+increment.  vctzlsbb yields the byte index of the first match within the
   chunk, so the returned pointer is r5 + increment + index.  */
41 # define TAIL(vreg,increment) \
42 vctzlsbb r4,vreg; \
43 addi r4,r4,increment; \
44 add r3,r5,r4; \
45 blr
46
47 #else /* strlen */
48
49 # ifndef STRLEN
50 # define FUNCNAME __strlen
51 # define DEFINE_STRLEN_HIDDEN_DEF 1
52 # else
53 # define FUNCNAME STRLEN
54 # endif
55 # define MCOUNT_NARGS 1
56 # define VREG_ZERO v18
57 # define OFF_START_LOOP 192
/* strlen variant: the returned length is (r5 - r3) + increment + index,
   where r5 is the 16B-aligned base of the chunked scan and r3 still holds
   the original string pointer.  */
58 # define TAIL(vreg,increment) \
59 vctzlsbb r4,vreg; \
60 subf r3,r3,r5; \
61 addi r4,r4,increment; \
62 add r3,r3,r4; \
63 blr
64 #endif /* USE_AS_RAWMEMCHR */
65
66 /* TODO: Replace macros by the actual instructions when minimum binutils becomes
67 >= 2.35. This is used to keep compatibility with older versions. */
/* Hand-encoded POWER10 "vextractbm rt,vrb" (VX-form, XO=1602): extracts the
   first bit of each byte of vrb into rt (used below to build a 64-bit match
   mask from the compare results).  */
68 #define VEXTRACTBM(rt,vrb) \
69 .long(((4)<<(32-6)) \
70 | ((rt)<<(32-11)) \
71 | ((8)<<(32-16)) \
72 | ((vrb)<<(32-21)) \
73 | 1602)
74
/* Hand-encoded POWER10 "lxvp xtp,dq(ra)": loads a 32B VSX register pair.
   See the comment at L(tail_64b) for the resulting register ordering.  */
75 #define LXVP(xtp,dq,ra) \
76 .long(((6)<<(32-6)) \
77 | ((((xtp)-32)>>1)<<(32-10)) \
78 | ((1)<<(32-11)) \
79 | ((ra)<<(32-16)) \
80 | dq)
81
/* Compare 16B at offset(addr) against v18 (zeros for strlen, the searched
   char for rawmemchr); "bne cr6" is taken when at least one byte matched.  */
82 #define CHECK16(vreg,offset,addr,label) \
83 lxv vreg+32,offset(addr); \
84 vcmpequb. vreg,vreg,v18; \
85 bne cr6,L(label);
86
87 /* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has #
88 of bytes already checked. */
89 #define CHECK64(offset,addr,label) \
90 li r6,offset; \
91 LXVP(v4+32,offset,addr); \
92 LXVP(v6+32,offset+32,addr); \
93 RAWMEMCHR_SUBTRACT_VECTORS; \
94 vminub v14,v4,v5; \
95 vminub v15,v6,v7; \
96 vminub v16,v14,v15; \
97 vcmpequb. v0,v16,VREG_ZERO; \
98 bne cr6,L(label)
100 /* Implements the function
101
102 int [r3] strlen (const void *s [r3])
103
104 but when USE_AS_RAWMEMCHR is set, implements the function
105
106 void* [r3] rawmemchr (const void *s [r3], int c [r4])
107
108 The implementation can load bytes past a matching byte, but only
109 up to the next 64B boundary, so it never crosses a page. */
110
/* NOTE: the directive stays at power9; the POWER10-only instructions are
   emitted through the .long encoding macros above for older binutils.  */
111 .machine power9
112
113 ENTRY_TOCLESS (FUNCNAME, 4)
114 CALL_MCOUNT MCOUNT_NARGS
115
116 #ifdef USE_AS_RAWMEMCHR
/* r5 = c ^ 0xff: a byte value guaranteed to differ from c, used below to
   fill the bytes of the unaligned first load that precede the string.  */
117 xori r5,r4,0xff
118
119 mtvsrd v18+32,r4 /* matching char in v18 */
120 mtvsrd v19+32,r5 /* non matching char in v19 */
121
122 vspltb v18,v18,7 /* replicate */
123 vspltb v19,v19,7 /* replicate */
124 #else
/* For strlen the fill byte is 0xff (any non-zero works) and the match
   byte v18 is VREG_ZERO, set just below.  */
125 vspltisb v19,-1
126 #endif
127 vspltisb VREG_ZERO,0
128
129 /* Next 16B-aligned address. Prepare address for L(aligned). */
130 addi r5,r3,16
131 clrrdi r5,r5,4
132
133 /* Align data and fill bytes not loaded with non matching char. */
134 lvx v0,0,r3
135 lvsr v1,0,r3
136 vperm v0,v19,v0,v1
137
/* cr6 "all false" is set when no byte of v0 equals v18; in that case the
   first (partial) 16B holds no match and the scan continues at L(aligned).  */
138 vcmpequb. v6,v0,v18
139 beq cr6,L(aligned)
140
141 #ifdef USE_AS_RAWMEMCHR
142 vctzlsbb r6,v6
143 add r3,r3,r6
144 #else
/* Match inside the first chunk: vctzlsbb's byte index is the length.  */
145 vctzlsbb r3,v6
146 #endif
147 blr
148
149 /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is
150 optimized for longer strings, so checking the first bytes in 16B
151 chunks benefits a lot small strings. */
152 .p2align 5
153 L(aligned):
154 #ifdef USE_AS_RAWMEMCHR
155 cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to
156 choose how we will perform the main loop. */
157 #endif
158 /* Prepare address for the loop. */
159 addi r4,r3,OFF_START_LOOP
160 clrrdi r4,r4,6
161
162 CHECK16(v0,0,r5,tail1)
163 CHECK16(v1,16,r5,tail2)
164 CHECK16(v2,32,r5,tail3)
165 CHECK16(v3,48,r5,tail4)
166 CHECK16(v4,64,r5,tail5)
167 CHECK16(v5,80,r5,tail6)
168 CHECK16(v6,96,r5,tail7)
169 CHECK16(v7,112,r5,tail8)
170 CHECK16(v8,128,r5,tail9)
171 CHECK16(v9,144,r5,tail10)
172 CHECK16(v10,160,r5,tail11)
173 #ifdef USE_AS_RAWMEMCHR
174 CHECK16(v0,176,r5,tail12)
175 CHECK16(v1,192,r5,tail13)
176 CHECK16(v2,208,r5,tail14)
177 CHECK16(v3,224,r5,tail15)
178 #endif
179
/* Second loop pointer, 128B ahead of r4 (see the comment above L(loop)).  */
180 addi r5,r4,128
181
182 #ifdef USE_AS_RAWMEMCHR
183 /* If c == 0, use the same loop as strlen, without the vsububm. */
184 beq cr5,L(loop)
185
186 /* This is very similar to the block after L(loop), the difference is
187 that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
188 each byte loaded by the char we are looking for, this way we can keep
189 using vminub to merge the results and checking for nulls. */
190 .p2align 5
191 L(rawmemchr_loop):
192 CHECK64(0,r4,pre_tail_64b)
193 CHECK64(64,r4,pre_tail_64b)
194 addi r4,r4,256
195
196 CHECK64(0,r5,tail_64b)
197 CHECK64(64,r5,tail_64b)
198 addi r5,r5,256
199
200 b L(rawmemchr_loop)
201 #endif
202 /* Switch to a more aggressive approach checking 64B each time. Use 2
203 pointers 128B apart and unroll the loop once to make the pointer
204 updates and usages separated enough to avoid stalls waiting for
205 address calculation. */
206 .p2align 5
207 L(loop):
208 #undef RAWMEMCHR_SUBTRACT_VECTORS
209 #define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
210 CHECK64(0,r4,pre_tail_64b)
211 CHECK64(64,r4,pre_tail_64b)
212 addi r4,r4,256
213
214 CHECK64(0,r5,tail_64b)
215 CHECK64(64,r5,tail_64b)
216 addi r5,r5,256
217
218 b L(loop)
219
220 .p2align 5
221 L(pre_tail_64b):
/* Match found through the r4 pointer; copy it into r5 so the tail code
   below is shared by both loop halves.  */
222 mr r5,r4
223 L(tail_64b):
224 /* OK, we found a null byte. Let's look for it in the current 64-byte
225 block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
226 low 16B bytes into vx+1, and the high into vx, so the order here is
227 v5, v4, v7, v6. */
228 vcmpequb v1,v5,VREG_ZERO
229 vcmpequb v2,v4,VREG_ZERO
230 vcmpequb v3,v7,VREG_ZERO
231 vcmpequb v4,v6,VREG_ZERO
232
233 /* Take into account the other 64B blocks we had already checked. */
234 add r5,r5,r6
235
236 /* Extract first bit of each byte. */
237 VEXTRACTBM(r7,v1)
238 VEXTRACTBM(r8,v2)
239 VEXTRACTBM(r9,v3)
240 VEXTRACTBM(r10,v4)
241
242 /* Shift each value into their corresponding position. */
243 sldi r8,r8,16
244 sldi r9,r9,32
245 sldi r10,r10,48
246
247 /* Merge the results. */
248 or r7,r7,r8
249 or r8,r9,r10
250 or r10,r8,r7
251
252 cnttzd r0,r10 /* Count trailing zeros before the match. */
253 #ifndef USE_AS_RAWMEMCHR
/* strlen: convert the match address base into a length relative to the
   original pointer still held in r3.  */
254 subf r5,r3,r5
255 #endif
256 add r3,r5,r0 /* Compute final length. */
257 blr
258
259 .p2align 5
260 L(tail1):
261 TAIL(v0,0)
262
263 .p2align 5
264 L(tail2):
265 TAIL(v1,16)
266
267 .p2align 5
268 L(tail3):
269 TAIL(v2,32)
270
271 .p2align 5
272 L(tail4):
273 TAIL(v3,48)
274
275 .p2align 5
276 L(tail5):
277 TAIL(v4,64)
278
279 .p2align 5
280 L(tail6):
281 TAIL(v5,80)
282
283 .p2align 5
284 L(tail7):
285 TAIL(v6,96)
286
287 .p2align 5
288 L(tail8):
289 TAIL(v7,112)
290
291 .p2align 5
292 L(tail9):
293 TAIL(v8,128)
294
295 .p2align 5
296 L(tail10):
297 TAIL(v9,144)
298
299 .p2align 5
300 L(tail11):
301 TAIL(v10,160)
302
303 #ifdef USE_AS_RAWMEMCHR
304 .p2align 5
305 L(tail12):
306 TAIL(v0,176)
307
308 .p2align 5
309 L(tail13):
310 TAIL(v1,192)
311
312 .p2align 5
313 L(tail14):
314 TAIL(v2,208)
315
316 .p2align 5
317 L(tail15):
318 TAIL(v3,224)
319 #endif
320
321 END (FUNCNAME)
322
/* Export the public symbol plus the libc-internal hidden alias for each
   build flavor of this file.  */
323 #ifdef USE_AS_RAWMEMCHR
324 weak_alias (__rawmemchr,rawmemchr)
325 libc_hidden_builtin_def (__rawmemchr)
326 #else
327 # ifdef DEFINE_STRLEN_HIDDEN_DEF
328 weak_alias (__strlen, strlen)
329 libc_hidden_builtin_def (strlen)
330 # endif
331 #endif