sysdeps/aarch64/multiarch/memcpy_sve.S
/* Optimized memcpy for SVE.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.

   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library.  If not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>

/* Assumptions:
 *
 * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
 *
 */

#define dstin   x0
#define src     x1
#define count   x2
#define dst     x3
#define srcend  x4
#define dstend  x5
#define tmp1    x6
#define vlen    x6

#define A_q     q0
#define B_q     q1
#define C_q     q2
#define D_q     q3
#define E_q     q4
#define F_q     q5
#define G_q     q6
#define H_q     q7

/* This implementation supports both memcpy and memmove and shares most code.
   It uses unaligned accesses and branchless sequences to keep the code small
   and simple and to improve performance.

   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
   copies of up to 128 bytes, and large copies.  The overhead of the overlap
   check in memmove is negligible since it is only required for large copies.

   Large copies use a software-pipelined loop processing 64 bytes per iteration.
   The source pointer is 16-byte aligned to minimize unaligned accesses.
   The loop tail is handled by always copying 64 bytes from the end.
*/
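
/* Illustrative sketch only, not part of the build: the large forward copy
   described above behaves roughly like the C below.  copy16 and
   copy_large_sketch are hypothetical names standing in for the q-register
   load/store pairs and the L(copy_long) path; this is an approximation of
   the logic, not a definitive reference.

     #include <stddef.h>
     #include <stdint.h>
     #include <string.h>

     // Stand-in for one q-register load/store pair (16 bytes).
     static inline void copy16 (unsigned char *d, const unsigned char *s)
     {
       memcpy (d, s, 16);
     }

     static void copy_large_sketch (unsigned char *dst,
                                    const unsigned char *src, size_t count)
     {
       unsigned char *dstend = dst + count;
       const unsigned char *srcend = src + count;

       copy16 (dst, src);                  // first 16 bytes, unaligned
       size_t skew = (uintptr_t) src & 15;
       src -= skew;                        // src is now 16-byte aligned
       dst -= skew;
       count += skew;

       // Copy 64 bytes per iteration while more than 128 + 16 bytes remain.
       for (; count > 128 + 16; count -= 64, src += 64, dst += 64)
         for (int i = 16; i < 80; i += 16)
           copy16 (dst + i, src + i);

       // Store the last 64 aligned bytes, then 64 bytes from the end.
       // The ranges may overlap; overlapping bytes receive identical data.
       for (int i = 16; i < 80; i += 16)
         copy16 (dst + i, src + i);
       for (int i = 1; i <= 4; i++)
         copy16 (dstend - 16 * i, srcend - 16 * i);
     }
   */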

#if HAVE_AARCH64_SVE_ASM

        .arch armv8.2-a+sve

ENTRY (__memcpy_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(copy_long)
        cmp     count, 32
        b.hi    L(copy32_128)

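        /* Small copies: 0..32 bytes.  Build a predicate with count active
           byte lanes.  If bit 4 of the vector length is clear, the vector
           length is a multiple of 32 bytes and a single predicated vector
           covers the copy; otherwise use two vectors (L(vlen128)).  */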
        whilelo p0.b, xzr, count
        cntb    vlen
        tbnz    vlen, 4, L(vlen128)
        ld1b    z0.b, p0/z, [src]
        st1b    z0.b, p0, [dstin]
        ret

        /* Medium copies: 33..128 bytes.  */
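        /* The first 32 and last 32 bytes are loaded unconditionally; the
           stores may overlap in the middle, which is harmless and removes
           the need for a byte-exact tail.  */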
L(copy32_128):
        add     srcend, src, count
        add     dstend, dstin, count
        ldp     A_q, B_q, [src]
        ldp     C_q, D_q, [srcend, -32]
        cmp     count, 64
        b.hi    L(copy128)
        stp     A_q, B_q, [dstin]
        stp     C_q, D_q, [dstend, -32]
        ret

        /* Copy 65..128 bytes.  */
L(copy128):
        ldp     E_q, F_q, [src, 32]
        cmp     count, 96
        b.ls    L(copy96)
        ldp     G_q, H_q, [srcend, -64]
        stp     G_q, H_q, [dstend, -64]
L(copy96):
        stp     A_q, B_q, [dstin]
        stp     E_q, F_q, [dstin, 32]
        stp     C_q, D_q, [dstend, -32]
        ret

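        /* Small copy using two predicated vectors, for vector lengths where
           bit 4 is set (e.g. a 16-byte vector length).  */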
L(vlen128):
        whilelo p1.b, vlen, count
        ld1b    z0.b, p0/z, [src, 0, mul vl]
        ld1b    z1.b, p1/z, [src, 1, mul vl]
        st1b    z0.b, p0, [dstin, 0, mul vl]
        st1b    z1.b, p1, [dstin, 1, mul vl]
        ret

        .p2align 4
        /* Copy more than 128 bytes.  */
L(copy_long):
        add     srcend, src, count
        add     dstend, dstin, count

        /* Copy 16 bytes and then align src to 16-byte alignment.  */
        ldr     D_q, [src]
        and     tmp1, src, 15
        bic     src, src, 15
        sub     dst, dstin, tmp1
        add     count, count, tmp1      /* Count is now 16 too large.  */
        ldp     A_q, B_q, [src, 16]
        str     D_q, [dstin]
        ldp     C_q, D_q, [src, 48]
        subs    count, count, 128 + 16  /* Test and readjust count.  */
        b.ls    L(copy64_from_end)
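        /* Software-pipelined loop: each iteration stores the 64 bytes loaded
           on the previous iteration and loads the next 64 bytes.  */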
L(loop64):
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [src, 80]
        stp     C_q, D_q, [dst, 48]
        ldp     C_q, D_q, [src, 112]
        add     src, src, 64
        add     dst, dst, 64
        subs    count, count, 64
        b.hi    L(loop64)

        /* Write the last iteration and copy 64 bytes from the end.  */
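        /* The tail stores may overlap bytes already written by the loop;
           the overlapping bytes receive identical data, so this is
           harmless.  */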
L(copy64_from_end):
        ldp     E_q, F_q, [srcend, -64]
        stp     A_q, B_q, [dst, 16]
        ldp     A_q, B_q, [srcend, -32]
        stp     C_q, D_q, [dst, 48]
        stp     E_q, F_q, [dstend, -64]
        stp     A_q, B_q, [dstend, -32]
        ret

END (__memcpy_sve)
libc_hidden_builtin_def (__memcpy_sve)


ENTRY (__memmove_sve)
        PTR_ARG (0)
        PTR_ARG (1)
        SIZE_ARG (2)

        cmp     count, 128
        b.hi    L(move_long)
        cmp     count, 32
        b.hi    L(copy32_128)

        whilelo p0.b, xzr, count
        cntb    vlen
        tbnz    vlen, 4, L(vlen128)
        ld1b    z0.b, p0/z, [src]
        st1b    z0.b, p0, [dstin]
        ret

        .p2align 4
L(move_long):
        add     srcend, src, count
        add     dstend, dstin, count
        /* Only use backward copy if there is an overlap.  */
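        /* dstin - src (as an unsigned value) being at least count means a
           forward copy cannot overwrite source bytes that have not been
           read yet, so the memcpy path can be used.  */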
        sub     tmp1, dstin, src
        cbz     tmp1, L(return)
        cmp     tmp1, count
        b.hs    L(copy_long)

        /* Large backwards copy for overlapping copies.
           Copy 16 bytes and then align srcend to 16-byte alignment.  */
        ldr     D_q, [srcend, -16]
        and     tmp1, srcend, 15
        bic     srcend, srcend, 15
        sub     count, count, tmp1
        ldp     A_q, B_q, [srcend, -32]
        str     D_q, [dstend, -16]
        ldp     C_q, D_q, [srcend, -64]
        sub     dstend, dstend, tmp1
        subs    count, count, 128
        b.ls    L(copy64_from_start)

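        /* Backwards software-pipelined loop: store the previously loaded
           64 bytes at the top of the remaining region, then load the next
           64 bytes below them.  */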
L(loop64_backwards):
        str     B_q, [dstend, -16]
        str     A_q, [dstend, -32]
        ldp     A_q, B_q, [srcend, -96]
        str     D_q, [dstend, -48]
        str     C_q, [dstend, -64]!
        ldp     C_q, D_q, [srcend, -128]
        sub     srcend, srcend, 64
        subs    count, count, 64
        b.hi    L(loop64_backwards)

        /* Write the last iteration and copy 64 bytes from the start.  */
L(copy64_from_start):
        ldp     E_q, F_q, [src, 32]
        stp     A_q, B_q, [dstend, -32]
        ldp     A_q, B_q, [src]
        stp     C_q, D_q, [dstend, -64]
        stp     E_q, F_q, [dstin, 32]
        stp     A_q, B_q, [dstin]
L(return):
        ret

END (__memmove_sve)
libc_hidden_builtin_def (__memmove_sve)
#endif