sysdeps/powerpc/powerpc64/le/power10/memmove.S
/* Optimized memmove implementation for POWER10.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>


/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])

   This implementation checks whether 'src' and 'dest' overlap.  If they
   do not, or if 'src' is ahead of 'dest', it copies forward.
   Otherwise, an optimized backward copy is used.  */
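
/* Illustrative C sketch of the dispatch below (commentary only, not
   built; copy_fwd and copy_bwd are hypothetical names for the forward
   and backward paths in this file):

     void *memmove (void *dest, const void *src, size_t len)
     {
       if ((uintptr_t) dest - (uintptr_t) src >= len)
         return copy_fwd (dest, src, len);  // no overlap, or src > dest
       return copy_bwd (dest, src, len);    // dest inside [src, src+len)
     }
 */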

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
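
/* All instructions used here are available from ISA 3.0 (POWER9),
   which is why .machine power9 suffices in this POWER10 file.  */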
	.machine power9
ENTRY_TOCLESS (MEMMOVE, 5)
	CALL_MCOUNT 3

L(_memmove):
	.p2align 5
	/* Check for overlap; if there is any, branch to the backward copy.  */
	subf	r9,r4,r3
	cmpld	cr7,r9,r5
	blt	cr7,L(memmove_bwd)

	/* Fast path for lengths of at most 16 bytes.  */
	sldi	r7,r5,56
	lxvl	32+v2,r4,r7
	stxvl	32+v2,r3,r7
	subic.	r8,r5,16
	blelr
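
	/* The sequence above relies on lxvl/stxvl taking the byte count
	   from bits 0:7 of RB and transferring at most 16 bytes; hence
	   the sldi that shifts the length into the top byte.  */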

	/* For shorter lengths, aligning the dest address to 16 bytes either
	   hurts performance or is irrelevant, so this comparison is used to
	   skip the alignment step.  */
	cmpldi	cr6,r5,256
	bge	cr6,L(ge_256)
	/* Account for the first 16-byte copy.  */
	addi	r4,r4,16
	addi	r11,r3,16	/* use r11 so the return value stays in r3.  */
	subi	r5,r5,16
	b	L(loop_head)

	.p2align 5
L(ge_256):
	/* Account for the first copy of at most 16 bytes.  This is necessary
	   for memmove because at this point the src address can be ahead of
	   the dest address.  */
	clrldi	r9,r5,56
	li	r8,16
	cmpldi	r9,16
	iselgt	r9,r8,r9
	add	r4,r4,r9
	add	r11,r3,r9	/* use r11 so the return value stays in r3.  */
	sub	r5,r5,r9
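
	/* In C terms (sketch): n = len & 0xFF; if (n > 16) n = 16;
	   src += n; dest += n; len -= n; -- exactly the number of bytes
	   the initial lxvl/stxvl pair copied.  */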

	/* Align dest to 16 bytes.  */
	neg	r7,r3
	clrldi.	r9,r7,60
	beq	L(loop_head)

	.p2align 5
	sldi	r6,r9,56
	lxvl	32+v0,r4,r6
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9
	add	r4,r4,r9
	add	r11,r11,r9

L(loop_head):
	cmpldi	r5,63
	ble	L(final_64)

	srdi.	r7,r5,7
	beq	L(loop_tail)

	mtctr	r7

	/* Main loop that copies 128 bytes each iteration.  */
	.p2align 5
L(loop):
	addi	r9,r4,64
	addi	r10,r11,64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,128
	addi	r11,r11,128

	lxv	32+v4,0(r9)
	lxv	32+v5,16(r9)
	lxv	32+v6,32(r9)
	lxv	32+v7,48(r9)

	stxv	32+v4,0(r10)
	stxv	32+v5,16(r10)
	stxv	32+v6,32(r10)
	stxv	32+v7,48(r10)

	bdnz	L(loop)
	clrldi.	r5,r5,57
	beqlr
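
	/* The loop above is, in effect (sketch):
	     for (n = len >> 7; n != 0; n--, src += 128, dest += 128)
	       copy 128 bytes as eight 16-byte vector load/store pairs;
	     len &= 127;
	   with the second 64 bytes addressed through separate registers
	   (r9/r10), presumably to keep more independent memory operations
	   in flight.  */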

	/* Copy 64 bytes.  */
	.p2align 5
L(loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(final_64)

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	addi	r4,r4,64
	addi	r11,r11,64
	subi	r5,r5,64

	/* Copy the last 1-63 bytes.  */
	.p2align 5
L(final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1)

	cmpldi	cr5,r5,32
	lxv	32+v0,0(r4)
	blt	cr5,L(tail2)

	cmpldi	cr6,r5,48
	lxv	32+v1,16(r4)
	blt	cr6,L(tail3)

	.p2align 5
	lxv	32+v2,32(r4)
	stxv	32+v2,32(r11)
L(tail3):
	stxv	32+v1,16(r11)
L(tail2):
	stxv	32+v0,0(r11)
	sub	r5,r5,r8
	add	r4,r4,r8
	add	r11,r11,r8
	.p2align 5
L(tail1):
	sldi	r6,r5,56
	lxvl	32+v4,r4,r6
	stxvl	32+v4,r11,r6
	blr
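
	/* Sketch of the tail above, for 0 < len < 64:
	     chunk = len & ~15;  // 0, 16, 32 or 48 bytes via lxv/stxv
	     src += chunk; dest += chunk; len -= chunk;
	   then lxvl/stxvl move the final len bytes (now < 16).  */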

	/* If dest and src overlap, copy backwards.  */
L(memmove_bwd):
	add	r11,r3,r5
	add	r4,r4,r5

	/* Fast path for lengths shorter than 16 bytes.  */
	cmpldi	cr5,r5,15
	ble	cr5,L(tail1_bwd)

	/* For shorter lengths the alignment either slows things down or is
	   irrelevant.  The forward copy already needs a 256-byte threshold
	   for that; here 128 is used instead, since it reduces code size and
	   improves readability.  */
	cmpldi	cr7,r5,128
	blt	cr7,L(bwd_loop_tail)

	/* Align the destination end address (r11) to 16 bytes.  */
	.p2align 5
	clrldi.	r9,r11,60
	beq	L(bwd_loop_head)
	sub	r4,r4,r9
	sub	r11,r11,r9
	lxv	32+v0,0(r4)
	sldi	r6,r9,56
	stxvl	32+v0,r11,r6
	sub	r5,r5,r9
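
	/* In C terms (sketch): n = (uintptr_t) dest_end & 15;
	   src_end -= n; dest_end -= n; copy those n bytes; len -= n;
	   -- so the stores in the loop below are 16-byte aligned.  */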

L(bwd_loop_head):
	srdi.	r7,r5,7
	beq	L(bwd_loop_tail)

	mtctr	r7

	/* Main loop that copies 128 bytes every iteration.  */
	.p2align 5
L(bwd_loop):
	addi	r9,r4,-64
	addi	r10,r11,-64

	lxv	32+v0,-16(r4)
	lxv	32+v1,-32(r4)
	lxv	32+v2,-48(r4)
	lxv	32+v3,-64(r4)

	stxv	32+v0,-16(r11)
	stxv	32+v1,-32(r11)
	stxv	32+v2,-48(r11)
	stxv	32+v3,-64(r11)

	addi	r4,r4,-128
	addi	r11,r11,-128

	lxv	32+v0,-16(r9)
	lxv	32+v1,-32(r9)
	lxv	32+v2,-48(r9)
	lxv	32+v3,-64(r9)

	stxv	32+v0,-16(r10)
	stxv	32+v1,-32(r10)
	stxv	32+v2,-48(r10)
	stxv	32+v3,-64(r10)

	bdnz	L(bwd_loop)
	clrldi.	r5,r5,57
	beqlr
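
	/* Unlike the forward loop, both 64-byte halves reuse v0-v3: each
	   half's stores precede the next loads in program order, and with
	   dest above src every store lands above any byte still to be
	   read, so no source data is clobbered early.  */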

	/* Copy 64 bytes.  */
	.p2align 5
L(bwd_loop_tail):
	cmpldi	cr5,r5,63
	ble	cr5,L(bwd_final_64)

	addi	r4,r4,-64
	addi	r11,r11,-64

	lxv	32+v0,0(r4)
	lxv	32+v1,16(r4)
	lxv	32+v2,32(r4)
	lxv	32+v3,48(r4)

	stxv	32+v0,0(r11)
	stxv	32+v1,16(r11)
	stxv	32+v2,32(r11)
	stxv	32+v3,48(r11)

	subi	r5,r5,64

	/* Copy the last 1-63 bytes.  */
	.p2align 5
L(bwd_final_64):
	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
	clrrdi.	r8,r5,4
	beq	L(tail1_bwd)

	cmpldi	cr5,r5,32
	lxv	32+v2,-16(r4)
	blt	cr5,L(tail2_bwd)

	cmpldi	cr6,r5,48
	lxv	32+v1,-32(r4)
	blt	cr6,L(tail3_bwd)

	.p2align 5
	lxv	32+v0,-48(r4)
	stxv	32+v0,-48(r11)
L(tail3_bwd):
	stxv	32+v1,-32(r11)
L(tail2_bwd):
	stxv	32+v2,-16(r11)
	sub	r4,r4,r5
	sub	r11,r11,r5
	sub	r5,r5,r8
	sldi	r6,r5,56
	lxvl	32+v4,r4,r6
	stxvl	32+v4,r11,r6
	blr
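
	/* Note that the residual len & 15 bytes sit at the low end of the
	   region and are copied last, after both pointers have been moved
	   back to the region's start.  */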

	/* Copy the remaining 1-15 bytes.  */
	.p2align 5
L(tail1_bwd):
	sub	r4,r4,r5
	sub	r11,r11,r5
	sldi	r6,r5,56
	lxvl	32+v4,r4,r6
	stxvl	32+v4,r11,r6
	blr

END_GEN_TB (MEMMOVE,TB_TOCLESS)
libc_hidden_builtin_def (memmove)