/* Optimized memcpy implementation for PowerPC32/POWER7.
   Copyright (C) 2010-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* void * [r3] memcpy (void *dst [r3], void *src [r4], size_t len [r5]);
   Returns 'dst'.  */

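/* A self-contained C sketch of the dispatch decision implemented below
   (illustrative only; the enum and function names are hypothetical, not part
   of glibc).  Lengths below 32 take the scalar path at L(copy_LT_32); when
   the low four address bits of SRC and DST match, the doubleword/VSX aligned
   path runs; everything else goes through the lvx/vperm realignment code at
   L(copy_GE_32_unaligned):

     #include <stddef.h>
     #include <stdint.h>

     enum path { SHORT_COPY, ALIGNED_COPY, UNALIGNED_COPY };

     static enum path
     choose_path (const void *dst, const void *src, size_t len)
     {
       if (len < 32)
         return SHORT_COPY;
       if (((uintptr_t) dst & 15) == ((uintptr_t) src & 15))
         return ALIGNED_COPY;
       return UNALIGNED_COPY;
     }
*/
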
        .machine  power7
EALIGN (memcpy, 5, 0)
        CALL_MCOUNT

        stwu    1,-32(1)
        cfi_adjust_cfa_offset(32)
        stw     30,20(1)
        cfi_offset(30,(20-32))
        stw     31,24(1)
        mr      30,3
        cmplwi  cr1,5,31
        neg     0,3
        cfi_offset(31,-8)
        ble     cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
                                       code.  */

        andi.   11,3,15             /* Check alignment of DST.  */
        clrlwi  10,4,28             /* Check alignment of SRC.  */
        cmplw   cr6,10,11           /* SRC and DST alignments match?  */
        mr      12,4
        mr      31,5
        bne     cr6,L(copy_GE_32_unaligned)

        srwi    9,5,3               /* Number of full doublewords remaining.  */

        beq     L(copy_GE_32_aligned_cont)

        clrlwi  0,0,29
        mtcrf   0x01,0
        subf    31,0,5

        /* Get the SRC aligned to 8 bytes.  */

1:      bf      31,2f
        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      bf      30,4f
        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      bf      29,0f
        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
0:
        clrlwi  10,12,29            /* Check alignment of SRC again.  */
        srwi    9,31,3              /* Number of full doublewords remaining.  */

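/* The prologue above (and several later blocks) moves the low bits of a byte
   count into CR7 with mtcrf 0x01 and then tests them with bf 31/30/29, i.e.
   it branches on the 1-, 2- and 4-byte digits of the count.  A rough C model
   of the 8-byte alignment step (illustrative only; the assembly issues
   lbz/lhz/lwz directly, since POWER handles unaligned scalar accesses):

     #include <stddef.h>
     #include <string.h>

     static size_t
     copy_head (unsigned char *d, const unsigned char *s, size_t n)
     {
       size_t o = 0;
       if (n & 1) { d[o] = s[o]; o += 1; }
       if (n & 2) { memcpy (d + o, s + o, 2); o += 2; }
       if (n & 4) { memcpy (d + o, s + o, 4); o += 4; }
       return o;
     }

   Here n is the 0..7 byte distance to the next 8-byte boundary of DST, which
   is what r0 holds after the neg/clrlwi pair above.  */
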
L(copy_GE_32_aligned_cont):

        clrlwi  11,31,29
        mtcrf   0x01,9

        srwi    8,31,5
        cmplwi  cr1,9,4
        cmplwi  cr6,11,0
        mr      11,12

        /* Copy 1~3 doublewords so the main loop starts
           at a multiple of 32 bytes.  */

        bf      30,1f
        lfd     6,0(12)
        lfd     7,8(12)
        addi    11,12,16
        mtctr   8
        stfd    6,0(3)
        stfd    7,8(3)
        addi    10,3,16
        bf      31,4f
        lfd     0,16(12)
        stfd    0,16(3)
        blt     cr1,3f
        addi    11,12,24
        addi    10,3,24
        b       4f

        .align  4
1:      /* Copy 1 doubleword and set the counter.  */
        mr      10,3
        mtctr   8
        bf      31,4f
        lfd     6,0(12)
        addi    11,12,8
        stfd    6,0(3)
        addi    10,3,8

L(aligned_copy):
        /* Main aligned copy loop.  Copies up to 128 bytes at a time.  */
        .align  4
4:
        /* Check for any 32-byte or 64-byte lumps that are outside of a
           nice 128-byte range.  R8 contains the number of 32-byte
           lumps, so drop this into the CR, and use the SO/EQ bits to help
           handle the 32- or 64-byte lumps.  Then handle the rest with an
           unrolled 128-bytes-at-a-time copy loop.  */
        mtocrf  1,8
        li      6,16                # 16() index
        li      7,32                # 32() index
        li      8,48                # 48() index

L(aligned_32byte):
        /* If the SO bit (indicating a 32-byte lump) is not set, move along.  */
        bns     cr7,L(aligned_64byte)
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        addi    11,11,32
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        addi    10,10,32

L(aligned_64byte):
        /* If the EQ bit (indicating a 64-byte lump) is not set, move along.  */
        bne     cr7,L(aligned_128setup)
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        lxvd2x  8,11,7
        lxvd2x  9,11,8
        addi    11,11,64
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        addi    10,10,64

L(aligned_128setup):
        /* Set up for the 128-byte-at-a-time copy loop.  */
        srwi    8,31,7
        cmpwi   8,0                 # Any 4x lumps left?
        beq     3f                  # if not, move along.
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        mtctr   8                   # otherwise, load the ctr and begin.
        li      8,48                # 48() index
        b       L(aligned_128loop)

L(aligned_128head):
        /* For the 2nd and subsequent iterations of this loop.  */
        lxvd2x  6,0,11
        lxvd2x  7,11,6
L(aligned_128loop):
        lxvd2x  8,11,7
        lxvd2x  9,11,8
        stxvd2x 6,0,10
        addi    11,11,64
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        lxvd2x  6,0,11
        lxvd2x  7,11,6
        addi    10,10,64
        lxvd2x  8,11,7
        lxvd2x  9,11,8
        addi    11,11,64
        stxvd2x 6,0,10
        stxvd2x 7,10,6
        stxvd2x 8,10,7
        stxvd2x 9,10,8
        addi    10,10,64
        bdnz    L(aligned_128head)
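
/* The aligned path above peels the remaining length apart by its binary
   digits: mtocrf drops the 32-byte-lump count (r8) into CR7, the SO bit
   selects a single 32-byte copy, the EQ bit a 64-byte copy, and the full
   128-byte multiples run through the unrolled lxvd2x/stxvd2x loop; the
   sub-32-byte tail is handled after the loop.  A rough C model of that
   decomposition (illustrative only; copy_block is a hypothetical stand-in
   for the vector load/store groups):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_block (unsigned char **d, const unsigned char **s, size_t bytes)
     {
       memcpy (*d, *s, bytes);
       *d += bytes;
       *s += bytes;
     }

     static void
     copy_aligned_bulk (unsigned char *d, const unsigned char *s,
                        size_t remaining)
     {
       size_t lumps32 = remaining / 32;
       if (lumps32 & 1)
         copy_block (&d, &s, 32);
       if (lumps32 & 2)
         copy_block (&d, &s, 64);
       for (size_t i = remaining / 128; i > 0; i--)
         copy_block (&d, &s, 128);
     }
*/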

3:
        /* Check for tail bytes.  */
        clrrwi  0,31,3
        mtcrf   0x01,31
        beq     cr6,0f

.L9:
        add     3,3,0
        add     12,12,0

        /* At this point we have a tail of 0-7 bytes and we know that the
           destination is doubleword-aligned.  */
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

        /* Handle copies of 0~31 bytes.  */
        .align  4
L(copy_LT_32):
        cmplwi  cr6,5,8
        mr      12,4
        mtcrf   0x01,5
        ble     cr6,L(copy_LE_8)

        /* At least 9 bytes to go.  */
        neg     8,4
        clrrwi  11,4,2
        andi.   0,8,3
        cmplwi  cr1,5,16
        mr      10,5
        beq     L(copy_LT_32_aligned)

        /* Force 4-byte alignment for SRC.  */
        mtocrf  0x01,0
        subf    10,0,5
2:      bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      bf      31,L(end_4bytes_alignment)

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1

        .align  4
L(end_4bytes_alignment):
        cmplwi  cr1,10,16
        mtcrf   0x01,10

L(copy_LT_32_aligned):
        /* At least 6 bytes to go, and SRC is word-aligned.  */
        blt     cr1,8f

        /* Copy 16 bytes.  */
        lwz     6,0(12)
        lwz     7,4(12)
        stw     6,0(3)
        lwz     8,8(12)
        stw     7,4(3)
        lwz     6,12(12)
        addi    12,12,16
        stw     8,8(3)
        stw     6,12(3)
        addi    3,3,16
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2-3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        sth     6,0(3)
        bf      31,0f
        lbz     7,2(12)
        stb     7,2(3)

        /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        /* Handle copies of 0~8 bytes.  */
        .align  4
L(copy_LE_8):
        bne     cr6,4f

        /* Though we could've used lfd/stfd here, they are still
           slow for unaligned cases.  */

        lwz     6,0(4)
        lwz     7,4(4)
        stw     6,0(3)
        stw     7,4(3)

        /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
4:      /* Copy 4~7 bytes.  */
        bf      29,2b

        lwz     6,0(4)
        stw     6,0(3)
        bf      30,5f
        lhz     7,4(4)
        sth     7,4(3)
        bf      31,0f
        lbz     8,6(4)
        stb     8,6(3)

        /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr

        .align  4
5:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,4(4)
        stb     6,4(3)

0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        addi    1,1,32
        blr
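
/* Neither short path above touches vector registers: L(copy_LT_32) first
   word-aligns SRC and then moves 16/8/4/2/1 bytes as selected by the bits of
   the remaining count, while L(copy_LE_8) special-cases an exact 8-byte copy
   and otherwise dispatches on bits 2/1/0 of the length.  A condensed C sketch
   of the <= 8 byte behaviour (illustrative only; the fixed-size memcpy calls
   stand in for the unaligned lwz/lhz/lbz pairs used by the assembly):

     #include <stddef.h>
     #include <string.h>

     static void
     copy_le_8 (unsigned char *d, const unsigned char *s, size_t len)
     {
       if (len == 8)
         {
           memcpy (d, s, 4);
           memcpy (d + 4, s + 4, 4);
           return;
         }
       size_t o = 0;
       if (len & 4) { memcpy (d, s, 4); o = 4; }
       if (len & 2) { memcpy (d + o, s + o, 2); o += 2; }
       if (len & 1) { d[o] = s[o]; }
     }
*/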

        /* Handle copies of 32+ bytes where DST is aligned (to quadword) but
           SRC is not.  Use aligned quadword loads from SRC, shifted to realign
           the data, allowing for aligned DST stores.  */
        .align  4
L(copy_GE_32_unaligned):
        andi.   11,3,15             /* Check alignment of DST.  */
        clrlwi  0,0,28              /* Number of bytes until the 1st
                                       quadword of DST.  */
        srwi    9,5,4               /* Number of full quadwords remaining.  */

        beq     L(copy_GE_32_unaligned_cont)

        /* DST is not quadword aligned, get it aligned.  */

        mtcrf   0x01,0
        subf    31,0,5

        /* Vector instructions work best when proper alignment (16 bytes)
           is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
1:      /* Copy 1 byte.  */
        bf      31,2f

        lbz     6,0(12)
        addi    12,12,1
        stb     6,0(3)
        addi    3,3,1
2:      /* Copy 2 bytes.  */
        bf      30,4f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
4:      /* Copy 4 bytes.  */
        bf      29,8f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
8:      /* Copy 8 bytes.  */
        bf      28,0f

        lfd     6,0(12)
        addi    12,12,8
        stfd    6,0(3)
        addi    3,3,8
0:
        clrlwi  10,12,28            /* Check alignment of SRC.  */
        srwi    9,31,4              /* Number of full quadwords remaining.  */

        /* The proper alignment is present; it is OK to copy the bytes now.  */
L(copy_GE_32_unaligned_cont):

        /* Set up two indexes to speed up the indexed vector operations.  */
        clrlwi  11,31,28
        li      6,16                /* Index for 16-byte offsets.  */
        li      7,32                /* Index for 32-byte offsets.  */
        cmplwi  cr1,11,0
        srwi    8,31,5              /* Set up the loop counter.  */
        mr      10,3
        mr      11,12
        mtcrf   0x01,9
        cmplwi  cr6,9,1
#ifdef __LITTLE_ENDIAN__
        lvsr    5,0,12
#else
        lvsl    5,0,12
#endif
        lvx     3,0,12
        bf      31,L(setup_unaligned_loop)

        /* Copy another 16 bytes to align to 32 bytes due to the loop.  */
        lvx     4,12,6
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        addi    11,12,16
        addi    10,3,16
        stvx    6,0,3
        vor     3,4,4

L(setup_unaligned_loop):
        mtctr   8
        ble     cr6,L(end_unaligned_loop)

        /* Copy 32 bytes at a time using vector instructions.  */
        .align  4
L(unaligned_loop):

        /* Note: vr6/vr10 may contain data that was already copied,
           but in order to get proper alignment, we may have to copy
           some portions again.  This is faster than having unaligned
           vector instructions though.  */

        lvx     4,11,6              /* vr4 = r11+16.  */
#ifdef __LITTLE_ENDIAN__
        vperm   6,4,3,5
#else
        vperm   6,3,4,5
#endif
        lvx     3,11,7              /* vr3 = r11+32.  */
#ifdef __LITTLE_ENDIAN__
        vperm   10,3,4,5
#else
        vperm   10,4,3,5
#endif
        addi    11,11,32
        stvx    6,0,10
        stvx    10,10,6
        addi    10,10,32

        bdnz    L(unaligned_loop)
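
/* In the loop above every load from SRC is a 16-byte-aligned lvx (lvx
   silently masks the low address bits), and a permute control computed once
   from the misalignment (lvsl on big endian, lvsr on little endian) splices
   each pair of adjacent aligned vectors back into the unaligned data, so all
   stores to the quadword-aligned DST can be plain stvx.  A hedged sketch of
   the classic big-endian formulation using AltiVec intrinsics (illustrative
   only, not the code generated from this file; the little-endian build swaps
   the vperm operands, as the #ifdef blocks above show):

     #include <altivec.h>
     #include <stddef.h>

     static void
     copy_unaligned_32 (unsigned char *dst, const unsigned char *src,
                        size_t nblocks)
     {
       vector unsigned char mask = vec_lvsl (0, src);
       vector unsigned char prev = vec_ld (0, src);
       for (size_t i = 0; i < nblocks; i++, src += 32, dst += 32)
         {
           vector unsigned char a = vec_ld (16, src);
           vector unsigned char b = vec_ld (32, src);
           vec_st (vec_perm (prev, a, mask), 0, dst);
           vec_st (vec_perm (a, b, mask), 16, dst);
           prev = b;
         }
     }

   Like the assembly, the sketch only ever reads the 16-byte aligned blocks
   that contain source bytes, which is why re-reading some already-copied
   data (see the note above) is harmless.  */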

        .align  4
L(end_unaligned_loop):

        /* Check for tail bytes.  */
        clrrwi  0,31,4
        mtcrf   0x01,31
        beq     cr1,0f

        add     3,3,0
        add     12,12,0

        /* We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
8:      /* Copy 8 bytes.  */
        bf      28,4f

        lwz     6,0(12)
        lwz     7,4(12)
        addi    12,12,8
        stw     6,0(3)
        stw     7,4(3)
        addi    3,3,8
4:      /* Copy 4 bytes.  */
        bf      29,2f

        lwz     6,0(12)
        addi    12,12,4
        stw     6,0(3)
        addi    3,3,4
2:      /* Copy 2~3 bytes.  */
        bf      30,1f

        lhz     6,0(12)
        addi    12,12,2
        sth     6,0(3)
        addi    3,3,2
1:      /* Copy 1 byte.  */
        bf      31,0f

        lbz     6,0(12)
        stb     6,0(3)
0:      /* Return original DST pointer.  */
        mr      3,30
        lwz     30,20(1)
        lwz     31,24(1)
        addi    1,1,32
        blr

END (memcpy)
libc_hidden_builtin_def (memcpy)