/* Function atanhf vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */

/*
 * ALGORITHM DESCRIPTION:
 *
 *   Compute atanh(x) as 0.5 * log((1 + x)/(1 - x)),
 *   using a small lookup table that maps to AVX-512 permute
 *   instructions.
 *
 *   Special cases:
 *
 *   atanh(0)  = 0
 *   atanh(+1) = +INF
 *   atanh(-1) = -INF
 *   atanh(x)  = NaN if |x| > 1, or if x is a NaN or INF
 */
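
/* A minimal scalar sketch of that identity (illustrative only;
   atanhf_ref is a hypothetical name, and the vector code below
   computes the logarithm itself rather than calling logf):

	#include <math.h>

	static float
	atanhf_ref (float x)
	{
	  return 0.5f * logf ((1.0f + x) / (1.0f - x));
	}
 */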

/* Offsets for data table __svml_satanh_data_internal_avx512 and
   __svml_satanh_data_internal_avx512_al64.  Ordered by use in the
   function.  On cold starts this might help the prefetcher.  Possibly
   a better idea is to interleave start/end so that the prefetcher is
   less likely to detect a stream and pull irrelevant lines into
   cache.  */

/* Offset into __svml_satanh_data_internal_avx512.  4-byte aligned as
   the memory is broadcast to {1to16}.  */
#define AbsMask				0

/* Offsets into __svml_satanh_data_internal_avx512_al64.  The full
   64-byte value is used here.  */
#define One				0
#define AddB5				64
#define RcpBitMask			128
#define Log_tbl_L_lo			192
#define Log_tbl_L_hi			256
#define Log_tbl_H_lo			320
#define Log_tbl_H_hi			384
#define L2H				448
#define L2L				512
#define poly_coeff3			576
#define poly_coeff2			640
#define poly_coeff1			704
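
/* Note that the al64 offsets step by 64: each of these tables is
   exactly one 64-byte zmm register worth of data.  */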

#include <sysdep.h>

#define ATANHF_DATA(x)			((x)+__svml_satanh_data_internal_avx512_al64)

	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_atanhf_skx)
	vandps	AbsMask+__svml_satanh_data_internal_avx512(%rip){1to16}, %zmm0, %zmm6
	vmovups	ATANHF_DATA(One)(%rip), %zmm4

	/* Yp = 1 + |x|.  */
	vaddps	{rn-sae}, %zmm4, %zmm6, %zmm9

	/* Ym = 1 - |x|.  */
	vsubps	{rn-sae}, %zmm6, %zmm4, %zmm8

	/* Round the reciprocals to 1+5b mantissas.  */
	vmovups	ATANHF_DATA(AddB5)(%rip), %zmm14
	vmovups	ATANHF_DATA(RcpBitMask)(%rip), %zmm1

	/* RcpP ~ 1/Yp.  */
	vrcp14ps	%zmm9, %zmm12

	/* RcpM ~ 1/Ym.  */
	vrcp14ps	%zmm8, %zmm13

	/* Yp_high.  */
	vsubps	{rn-sae}, %zmm4, %zmm9, %zmm2

	/* -Ym_high.  */
	vsubps	{rn-sae}, %zmm4, %zmm8, %zmm5

	/* Add the rounding constant to each reciprocal.  */
	vpaddd	%zmm14, %zmm12, %zmm15
	vpaddd	%zmm14, %zmm13, %zmm12

	/* Yp_low.  */
	vsubps	{rn-sae}, %zmm2, %zmm6, %zmm3

	/* Mask the reciprocals down to 1+5b mantissas.  */
	vandps	%zmm1, %zmm15, %zmm7
	vandps	%zmm1, %zmm12, %zmm12

	/* Ym_low.  */
	vaddps	{rn-sae}, %zmm5, %zmm6, %zmm5
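
/* Scalar sketch of the reciprocal rounding above, assuming IEEE
   binary32 (round_rcp_1p5b is a hypothetical helper): adding
   AddB5 = 1 << 17 and masking with RcpBitMask = 0xfffc0000 rounds the
   vrcp14ps estimate to a reciprocal with a 1+5-bit mantissa:

	#include <stdint.h>
	#include <string.h>

	static float
	round_rcp_1p5b (float rcp)
	{
	  uint32_t u;
	  memcpy (&u, &rcp, sizeof (u));
	  u = (u + 0x00020000) & 0xfffc0000;
	  memcpy (&rcp, &u, sizeof (rcp));
	  return rcp;
	}
 */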

	/* Reduced argument: Rp = (RcpP*Yp - 1) + RcpP*Yp_low.  */
	vfmsub213ps	{rn-sae}, %zmm4, %zmm7, %zmm9

	/* Reduced argument: Rm = (RcpM*Ym - 1) + RcpM*Ym_low.  */
	vfmsub213ps	{rn-sae}, %zmm4, %zmm12, %zmm8

	vmovups	ATANHF_DATA(Log_tbl_L_lo)(%rip), %zmm10
	vmovups	ATANHF_DATA(Log_tbl_L_hi)(%rip), %zmm13

	/* Add the RcpP*Yp_low correction term and get the exponent of
	   the rounded RcpP.  */
	vfmadd231ps	{rn-sae}, %zmm7, %zmm3, %zmm9
	vgetexpps	{sae}, %zmm7, %zmm15

	/* Likewise for the Rm half.  */
	vfnmadd231ps	{rn-sae}, %zmm12, %zmm5, %zmm8
	vgetexpps	{sae}, %zmm12, %zmm14

	/* Prepare table index from the top bits of each rounded
	   reciprocal.  */
	vpsrld	$18, %zmm7, %zmm3
	vpsrld	$18, %zmm12, %zmm2
	vmovups	ATANHF_DATA(Log_tbl_H_lo)(%rip), %zmm11
	vmovups	ATANHF_DATA(Log_tbl_H_hi)(%rip), %zmm7

	/* Table lookups.  vpermi2ps overwrites its index operand, so keep
	   a copy of the Rp index in zmm5.  */
	vmovaps	%zmm3, %zmm5
	vpermi2ps	%zmm13, %zmm10, %zmm3
	vpermt2ps	%zmm13, %zmm2, %zmm10
	vpermi2ps	%zmm7, %zmm11, %zmm5
	vpermt2ps	%zmm7, %zmm2, %zmm11
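
/* Scalar sketch of the lookup (Log_tbl_H and Log_tbl_L are
   hypothetical 32-entry arrays holding the concatenated _lo/_hi
   halves, u is the bit pattern of a rounded reciprocal).  With a
   1+5-bit mantissa, bits 22:18 select the entry, and the two-source
   permutes use the low 5 bits of the shifted value as lane index:

	unsigned idx = (u >> 18) & 0x1f;
	float Th = Log_tbl_H[idx];
	float Tl = Log_tbl_L[idx];
 */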

	/* K = getexp(RcpM) - getexp(RcpP).  */
	vsubps	{rn-sae}, %zmm15, %zmm14, %zmm1

	/* Tl = Tl_m - Tl_p.  */
	vsubps	{rn-sae}, %zmm3, %zmm10, %zmm7

	vmovups	ATANHF_DATA(L2H)(%rip), %zmm2
	vmovups	ATANHF_DATA(L2L)(%rip), %zmm3

	/* Th = Th_m - Th_p, then accumulate K*L2H + Th and
	   K*L2L + Tl.  */
	vsubps	{rn-sae}, %zmm5, %zmm11, %zmm5
	vfmadd231ps	{rn-sae}, %zmm1, %zmm2, %zmm5
	vfmadd213ps	{rn-sae}, %zmm7, %zmm3, %zmm1

	/* Evaluate Poly = ((c3*R + c2)*R + c1)*R + 1 by Horner's rule,
	   for both Rp (zmm9) and Rm (zmm8).  */
	vmovups	ATANHF_DATA(poly_coeff3)(%rip), %zmm7
	vmovups	ATANHF_DATA(poly_coeff2)(%rip), %zmm10
	vmovaps	%zmm10, %zmm14
	vfmadd231ps	{rn-sae}, %zmm9, %zmm7, %zmm10
	vfmadd231ps	{rn-sae}, %zmm8, %zmm7, %zmm14
	vmovups	ATANHF_DATA(poly_coeff1)(%rip), %zmm12
	vfmadd213ps	{rn-sae}, %zmm12, %zmm9, %zmm10
	vfmadd213ps	{rn-sae}, %zmm12, %zmm8, %zmm14
	vfmadd213ps	{rn-sae}, %zmm4, %zmm9, %zmm10
	vfmadd213ps	{rn-sae}, %zmm4, %zmm8, %zmm14

	/* (K*L2L + Tl) + Rp*PolyP.  */
	vfmadd213ps	{rn-sae}, %zmm1, %zmm9, %zmm10

	/* zmm12 = zmm12 & (zmm4 | zmm0).  With zmm12 = poly_coeff1 =
	   -0.5f and zmm4 = 1.0f, this bit-trick yields
	   copysign (0.5f, x), the final scale factor.  */
	vpternlogq	$0xe0, %zmm0, %zmm4, %zmm12
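
/* Bit-level check of that trick, assuming IEEE binary32
   (half_with_sign_of is a hypothetical helper; 0xbf000000 is -0.5f,
   0x3f800000 is 1.0f, and the result is 0x3f000000 = 0.5f with x's
   sign bit, i.e. copysignf (0.5f, x)):

	#include <stdint.h>
	#include <string.h>

	static float
	half_with_sign_of (float x)
	{
	  uint32_t xb, r;
	  memcpy (&xb, &x, sizeof (xb));
	  r = 0xbf000000 & (0x3f800000 | xb);
	  memcpy (&x, &r, sizeof (x));
	  return x;
	}
 */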

	/* (K*L2L + Tl) + Rp*PolyP - Rm*PolyM.  */
	vfnmadd213ps	{rn-sae}, %zmm5, %zmm8, %zmm14
	vaddps	{rn-sae}, %zmm14, %zmm10, %zmm8
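
/* Putting the halves together (scalar sketch of the math, with
   c1 = -0.5f, c2 ~ 0.33338f, c3 ~ -0.25006f the stored coefficients):

	float polyp = ((c3 * rp + c2) * rp + c1) * rp + 1.0f;
	float polym = ((c3 * rm + c2) * rm + c1) * rm + 1.0f;
	float res = (k * L2H + th) + (k * L2L + tl)
		    + rp * polyp - rm * polym;

   Here r * poly approximates r - r*r/2 + r*r*r/3 - r*r*r*r/4, i.e.
   log1p(r), and the final result is copysign (0.5f, x) * res.  */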

	/* Input outside (-1, 1)?  */
	vcmpps	$21, {sae}, %zmm4, %zmm6, %k0
	kmovw	%k0, %edx
	testl	%edx, %edx

	/* Go to special inputs processing branch.  */
	jne	L(SPECIAL_VALUES_BRANCH)
	# LOE rbx r12 r13 r14 r15 zmm0 zmm8 zmm12

	/* Fast path: scale by copysign (0.5, x) and return.  */
	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm0

	/* No register to restore on fast path.  */
	ret

	/* Cold case.  edx has 1s where there was a special value that
	   needs to be handled by an atanhf call.  Optimize for code size
	   more so than speed here.  */
L(SPECIAL_VALUES_BRANCH):
	# LOE rbx rdx r12 r13 r14 r15 zmm0 zmm8 zmm12
	/* Use r13 to save/restore the stack.  This allows us to use rbp
	   as a callee save register, saving code size.  */
	pushq	%r13
	cfi_adjust_cfa_offset(8)
	cfi_offset(r13, -16)
	/* Need to callee save registers to preserve state across atanhf
	   calls.  */
	pushq	%rbx
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbx, -24)
	pushq	%rbp
	cfi_adjust_cfa_offset(8)
	cfi_offset(rbp, -32)
	movq	%rsp, %r13
	cfi_def_cfa_register(r13)

	/* Align stack and make room for 2x zmm vectors.  */
	andq	$-64, %rsp
	subq	$128, %rsp

	/* Compute the full vector result and spill it, together with the
	   original input; the loop below patches the special lanes.  */
	vmulps	{rn-sae}, %zmm12, %zmm8, %zmm1
	vmovaps	%zmm1, (%rsp)
	vmovaps	%zmm0, 64(%rsp)
	vzeroupper

	/* edx has 1s where there was a special value that needs to be
	   handled by an atanhf call.  */
	movl	%edx, %ebx
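
/* Stack layout at this point (%rsp is 64-byte aligned):
      0(%rsp) ..  63(%rsp): vector of results for non-special lanes
     64(%rsp) .. 127(%rsp): vector of original inputs.  */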
L(SPECIAL_VALUES_LOOP):
	# LOE rbx rbp r12 r13 r14 r15
	/* Use rbp as the index for the special value that is saved across
	   calls to atanhf.  We technically don't need a callee save
	   register here as the offset to rsp is always [0, 56], so we
	   could restore rsp by realigning to 64.  Essentially the
	   tradeoff is 1 extra save/restore vs 2 extra instructions in
	   the loop.  Realigning also costs more code size.  */

	/* Get the index of the lowest set bit in ebx.  */
	tzcntl	%ebx, %ebp

	/* Scalar math function call to process special input.  */
	vmovss	64(%rsp, %rbp, 4), %xmm0
	call	atanhf@PLT

	/* No good way to avoid the store-forwarding fault this will cause
	   on return.  `lfence` avoids the SF fault but at greater cost as
	   it serializes stack/callee save restoration.  */
	vmovss	%xmm0, (%rsp, %rbp, 4)

	/* Clear the bit just handled and loop while any bits remain.  */
	blsrl	%ebx, %ebx
	jnz	L(SPECIAL_VALUES_LOOP)
	# LOE r12 r13 r14 r15
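
/* The loop above is the usual set-bit iteration, equivalent to this C
   sketch (mask corresponds to ebx, src/dst to the two stack slots;
   tzcnt ~ __builtin_ctz, blsr ~ mask &= mask - 1):

	while (mask != 0)
	  {
	    int i = __builtin_ctz (mask);
	    dst[i] = atanhf (src[i]);
	    mask &= mask - 1;
	  }
 */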

	/* All results have been written to (%rsp).  */
	vmovaps	(%rsp), %zmm0

	/* Restore rsp.  */
	movq	%r13, %rsp
	cfi_def_cfa_register(rsp)

	/* Restore callee save registers.  */
	popq	%rbp
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbp)
	popq	%rbx
	cfi_adjust_cfa_offset(-8)
	cfi_restore(rbx)
	popq	%r13
	cfi_adjust_cfa_offset(-8)
	cfi_restore(r13)
	ret
END(_ZGVeN16v_atanhf_skx)

	.section .rodata, "a"
	.align	64

#ifdef __svml_satanh_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(4)) VUINT32 AbsMask[1][1];
	__declspec(align(64)) VUINT32 One[16][1];
	__declspec(align(64)) VUINT32 AddB5[16][1];
	__declspec(align(64)) VUINT32 RcpBitMask[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_L_lo[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_L_hi[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_H_lo[16][1];
	__declspec(align(64)) VUINT32 Log_tbl_H_hi[16][1];
	__declspec(align(64)) VUINT32 L2H[16][1];
	__declspec(align(64)) VUINT32 L2L[16][1];
	__declspec(align(64)) VUINT32 poly_coeff3[16][1];
	__declspec(align(64)) VUINT32 poly_coeff2[16][1];
	__declspec(align(64)) VUINT32 poly_coeff1[16][1];
} __svml_satanh_data_internal_avx512;
#endif

__svml_satanh_data_internal_avx512:
	/* Leave this at front so we can potentially save space due to
	   smaller alignment constraint.  */
	/* AbsMask: clears the float sign bit.  */
	.long	0x7fffffff

	.align	64
__svml_satanh_data_internal_avx512_al64:
	/* One = 1.0f.  */
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	.long	0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
	/* AddB5 = 1 << 17: rounding bit for a 1+5-bit mantissa.  */
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	.long	0x00020000, 0x00020000, 0x00020000, 0x00020000
	/* RcpBitMask: keeps the sign, exponent, and top 5 mantissa
	   bits.  */
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	.long	0xfffc0000, 0xfffc0000, 0xfffc0000, 0xfffc0000
	/* L2H = log(2)_high.  */
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	.long	0x3f317000, 0x3f317000, 0x3f317000, 0x3f317000
	/* L2L = log(2)_low.  */
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	.long	0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4
	/* poly_coeff3 ~ -0.25006f.  */
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	.long	0xbe800810, 0xbe800810, 0xbe800810, 0xbe800810
	/* poly_coeff2 ~ 0.33338f.  */
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	.long	0x3eaab11e, 0x3eaab11e, 0x3eaab11e, 0x3eaab11e
	/* poly_coeff1 = -0.5f.  */
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.long	0xbf000000, 0xbf000000, 0xbf000000, 0xbf000000
	.type	__svml_satanh_data_internal_avx512_al64, @object
	.size	__svml_satanh_data_internal_avx512_al64, .-__svml_satanh_data_internal_avx512_al64
	.type	__svml_satanh_data_internal_avx512, @object
	.size	__svml_satanh_data_internal_avx512, .-__svml_satanh_data_internal_avx512