initial commit
[glibc.git] / sysdeps / powerpc / powerpc32 / power7 / memset.S
1 /* Optimized memset implementation for PowerPC32/POWER7.
2 Copyright (C) 2010-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19 #include <sysdep.h>
20
21 /* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
22 Returns 's'. */
23
24 .machine power7
25 EALIGN (memset, 5, 0)
26 CALL_MCOUNT
/* Register usage throughout:
     r3  - original DST, left untouched so it doubles as the return value
     r4  - fill byte, replicated to a full word just below
     r5  - length in bytes
     r7  - copy of the entry r1, restored before every return
     r10 - working DST pointer
     r12 - length saved on entry to the big path; later a second store
           pointer that ping-pongs with r10.  */
27
28 .align 4
29 L(_memset):
30 cmplwi cr7,5,31
31 cmplwi cr6,5,8
32 mr 10,3 /* Save original argument for later. */
33 mr 7,1 /* Save original r1 for later. */
/* NOTE(review): this annotation claims r31 was saved at -8(r1), but no
   store of r31 appears anywhere in this function -- confirm whether the
   cfi_offset is stale.  */
34 cfi_offset(31,-8)
35
36 /* Replicate byte to word. */
37 insrwi 4,4,8,16
38 insrwi 4,4,16,0
39
40 ble cr6,L(small) /* If length <= 8, use short copy code. */
41
42 neg 0,3
43 ble cr7,L(medium) /* If length < 32, use medium copy code. */
44
45 /* Save our word twice to create a doubleword that we will later
46 copy to a FPR. */
47 stwu 1,-32(1)
48 andi. 11,10,7 /* Check alignment of DST. */
49 mr 12,5
50 stw 4,24(1)
51 stw 4,28(1)
52 beq L(big_aligned)
53
/* r0 holds -DST (from the neg above); its low 3 bits are the number of
   bytes needed to reach 8-byte alignment.  */
54 clrlwi 0,0,29
55 mtocrf 0x01,0
56 subf 5,0,5
57
58 /* Get DST aligned to 8 bytes. */
59 1: bf 31,2f
60
61 stb 4,0(10)
62 addi 10,10,1
63 2: bf 30,4f
64
65 sth 4,0(10)
66 addi 10,10,2
67 4: bf 29,L(big_aligned)
68
69 stw 4,0(10)
70 addi 10,10,4
71
72 .align 4
73 L(big_aligned):
74 cmplwi cr5,5,255
75 li 0,32
76 cmplwi cr1,5,160
77 dcbtst 0,10
78 cmplwi cr6,4,0
79 srwi 9,5,3 /* Number of full doublewords remaining. */
/* CR bit 27 = cr6.eq AND cr5.gt, i.e. (value == 0) && (length > 255);
   when both hold, take the dcbz-based zeroing path.  */
80 crand 27,26,21
/* cr7 = low 4 bits of the doubleword count, for the bf 30/31 tests.  */
81 mtocrf 0x01,9
82 bt 27,L(huge)
83
84 /* From this point on, we'll copy 32+ bytes and the value
85 isn't 0 (so we can't use dcbz). */
86
87 srwi 8,5,5
88 clrlwi 11,5,29
89 cmplwi cr6,11,0
90 cmplwi cr1,9,4
91 mtctr 8
92
93 /* Copy 1~3 doublewords so the main loop starts
94 at a multiple of 32 bytes. */
95
96 bf 30,1f
97
98 stw 4,0(10)
99 stw 4,4(10)
100 stw 4,8(10)
101 stw 4,12(10)
102 addi 10,10,16
103 bf 31,L(big_loop)
104
105 stw 4,0(10)
106 stw 4,4(10)
107 addi 10,10,8
108 mr 12,10
109 blt cr1,L(tail_bytes)
110
111 b L(big_loop)
112
113 .align 4
114 1: /* Copy 1 doubleword. */
115 bf 31,L(big_loop)
116
117 stw 4,0(10)
118 stw 4,4(10)
119 addi 10,10,8
120
121 /* First use a 32-bytes loop with stw's to try and avoid the LHS (load-hit-store hazard) due
122 to the lfd we will do next. Also, ping-pong through r10 and r12
123 to avoid AGEN delays. */
124 .align 4
125 L(big_loop):
126 addi 12,10,32
127 stw 4,0(10)
128 stw 4,4(10)
129 stw 4,8(10)
130 stw 4,12(10)
131 stw 4,16(10)
132 stw 4,20(10)
133 stw 4,24(10)
134 stw 4,28(10)
135 bdz L(tail_bytes)
136
137 addi 10,10,64
138 stw 4,0(12)
139 stw 4,4(12)
140 stw 4,8(12)
141 stw 4,12(12)
142 stw 4,16(12)
143 stw 4,20(12)
144 stw 4,24(12)
145 stw 4,28(12)
146 bdnz L(big_loop_fast_setup)
147
148 mr 12,10
149 b L(tail_bytes)
150
151 /* Now that we're probably past the LHS window, use the VSX to
152 speed up the loop. */
153 L(big_loop_fast_setup):
154 li 11,24
155 li 6,16
/* Splat the doubleword stored at 24(r1) (the fill word, stored twice
   above) across VSR4 for the 16-byte stxvd2x stores below.  */
156 lxvdsx 4,1,11
157
158 .align 4
159 L(big_loop_fast):
160 addi 12,10,32
161 stxvd2x 4,0,10
162 stxvd2x 4,10,6
163 bdz L(tail_bytes)
164
165 addi 10,10,64
166 stxvd2x 4,0,12
167 stxvd2x 4,12,6
168 bdnz L(big_loop_fast)
169
170 mr 12,10
171
172 .align 4
173 L(tail_bytes):
174
175 /* Check for tail bytes. */
176 mr 1,7 /* Restore r1. */
177 beqlr cr6
178
179 clrlwi 0,5,29
180 mtocrf 0x01,0
181
182 /* At this point we have a tail of 0-7 bytes and we know that the
183 destination is doubleword-aligned. */
184 4: /* Copy 4 bytes. */
185 bf 29,2f
186
187 stw 4,0(12)
188 addi 12,12,4
189 2: /* Copy 2 bytes. */
190 bf 30,1f
191
192 sth 4,0(12)
193 addi 12,12,2
194 1: /* Copy 1 byte. */
195 bflr 31
196
197 stb 4,0(12)
198 blr
199
200
201 /* Special case when value is 0 and we have a long length to deal
202 with. Use dcbz to zero out 128-bytes at a time. Before using
203 dcbz though, we need to get the destination 128-bytes aligned. */
204 .align 4
205 L(huge):
/* Pick up the replicated doubleword built on the stack earlier.  */
206 lfd 4,24(1)
207 andi. 11,10,127
208 neg 0,10
209 beq L(huge_aligned)
210
/* Low 7 bits of -DST = bytes needed to reach 128-byte alignment;
   shift to a doubleword count for the bf 28..31 dispatch below.  */
211 clrlwi 0,0,25
212 subf 5,0,5
213 srwi 0,0,3
214 mtocrf 0x01,0
215
216 /* Get DST aligned to 128 bytes. */
217 8: bf 28,4f
218
219 stfd 4,0(10)
220 stfd 4,8(10)
221 stfd 4,16(10)
222 stfd 4,24(10)
223 stfd 4,32(10)
224 stfd 4,40(10)
225 stfd 4,48(10)
226 stfd 4,56(10)
227 addi 10,10,64
228 .align 4
229 4: bf 29,2f
230
231 stfd 4,0(10)
232 stfd 4,8(10)
233 stfd 4,16(10)
234 stfd 4,24(10)
235 addi 10,10,32
236 .align 4
237 2: bf 30,1f
238
239 stfd 4,0(10)
240 stfd 4,8(10)
241 addi 10,10,16
242 .align 4
243 1: bf 31,L(huge_aligned)
244
245 stfd 4,0(10)
246 addi 10,10,8
247
248 L(huge_aligned):
249 srwi 8,5,7
250 clrlwi 11,5,25
251 cmplwi cr6,11,0
252 mtctr 8
253
254 /* Copies 128-bytes at a time. */
255 .align 4
256 L(huge_loop):
257 dcbz 0,10
258 addi 10,10,128
259 bdnz L(huge_loop)
260
261 /* We have a tail of 0~127 bytes to handle. */
262 mr 1,7 /* Restore r1. */
263 beqlr cr6
264
/* Recompute the remaining length: r12 still holds the length saved on
   entry to the big path, and r10 - r3 is the bytes written so far.  */
265 subf 9,3,10
266 subf 5,9,12
267 srwi 8,5,3
268 cmplwi cr6,8,0
269 mtocrf 0x01,8
270
271 /* We have a tail of 1~127 bytes. Copy up to 15 doublewords for
272 speed. We'll handle the resulting tail bytes later. */
273 beq cr6,L(tail)
274
275 8: bf 28,4f
276
277 stfd 4,0(10)
278 stfd 4,8(10)
279 stfd 4,16(10)
280 stfd 4,24(10)
281 stfd 4,32(10)
282 stfd 4,40(10)
283 stfd 4,48(10)
284 stfd 4,56(10)
285 addi 10,10,64
286 .align 4
287 4: bf 29,2f
288
289 stfd 4,0(10)
290 stfd 4,8(10)
291 stfd 4,16(10)
292 stfd 4,24(10)
293 addi 10,10,32
294 .align 4
295 2: bf 30,1f
296
297 stfd 4,0(10)
298 stfd 4,8(10)
299 addi 10,10,16
300 .align 4
301 1: bf 31,L(tail)
302
303 stfd 4,0(10)
304 addi 10,10,8
305
306 /* Handle the rest of the tail bytes here. */
307 L(tail):
308 mtocrf 0x01,5
309
310 .align 4
311 4: bf 29,2f
312
313 stw 4,0(10)
314 addi 10,10,4
315 .align 4
316 2: bf 30,1f
317
318 sth 4,0(10)
319 addi 10,10,2
320 .align 4
321 1: bflr 31
322
323 stb 4,0(10)
324 blr
325
326
327 /* Expanded tree to copy tail bytes without increments. */
/* On entry cr7 holds the low bits of the length (set in L(small));
   bits 29/30/31 select a word, halfword and byte store respectively.
   The label letters record which of those bits were taken (T) or
   skipped (F).  */
328 .align 4
329 L(copy_tail):
330 bf 29,L(FXX)
331
332 stw 4,0(10)
333 bf 30,L(TFX)
334
335 sth 4,4(10)
336 bflr 31
337
338 stb 4,6(10)
339 blr
340
341 .align 4
342 L(FXX): bf 30,L(FFX)
343
344 sth 4,0(10)
345 bflr 31
346
347 stb 4,2(10)
348 blr
349
350 .align 4
351 L(TFX): bflr 31
352
353 stb 4,4(10)
354 blr
355
356 .align 4
357 L(FFX): bflr 31
358
359 stb 4,0(10)
360 blr
361
362 /* Handle copies of 9~31 bytes. */
363 .align 4
364 L(medium):
365 /* At least 9 bytes to go. */
366 andi. 11,10,3
367 clrlwi 0,0,30
368 beq L(medium_aligned)
369
370 /* Force 4-bytes alignment for DST. */
371 mtocrf 0x01,0
372 subf 5,0,5
373 1: /* Copy 1 byte. */
374 bf 31,2f
375
376 stb 4,0(10)
377 addi 10,10,1
378 2: /* Copy 2 bytes. */
379 bf 30,L(medium_aligned)
380
381 sth 4,0(10)
382 addi 10,10,2
383
384 .align 4
385 L(medium_aligned):
386 /* At least 6 bytes to go, and DST is word-aligned. */
387 cmplwi cr1,5,16
388 mtocrf 0x01,5
389 blt cr1,8f
390
391 /* Copy 16 bytes. */
392 stw 4,0(10)
393 stw 4,4(10)
394 stw 4,8(10)
395 stw 4,12(10)
396 addi 10,10,16
397 8: /* Copy 8 bytes. */
398 bf 28,4f
399
400 stw 4,0(10)
401 stw 4,4(10)
402 addi 10,10,8
403 4: /* Copy 4 bytes. */
404 bf 29,2f
405
406 stw 4,0(10)
407 addi 10,10,4
408 2: /* Copy 2-3 bytes. */
409 bf 30,1f
410
411 sth 4,0(10)
412 addi 10,10,2
413 1: /* Copy 1 byte. */
414 bflr 31
415
416 stb 4,0(10)
417 blr
418
419 /* Handles copies of 0~8 bytes. */
420 .align 4
421 L(small):
422 mtocrf 0x01,5
/* Fall through only when length == 8 exactly (cr6 from the entry
   compare against 8): store two words and return.  */
423 bne cr6,L(copy_tail)
424
425 stw 4,0(10)
426 stw 4,4(10)
427 blr
428
429 END (memset)
430 libc_hidden_builtin_def (memset)