1 /* Function cbrt vectorized with AVX-512.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
/*
 * ALGORITHM DESCRIPTION:
 *
 *   x=2^{3*k+j} * 1.b1 b2 ... b5 b6 ... b52
 *   Let r=(x*2^{-3k-j} - 1.b1 b2 ... b5 1)* rcp[b1 b2 ..b5],
 *   where rcp[b1 b2 .. b5]=1/(1.b1 b2 b3 b4 b5 1) in double precision
 *   cbrt(2^j * 1. b1 b2 .. b5 1) is approximated as T[j][b1..b5]+D[j][b1..b5]
 *   (T stores the high 53 bits, D stores the low order bits)
 *   Result=2^k*T+(2^k*T*r)*P+2^k*D
 *   where P=p1+p2*r+..+p8*r^7
 *
 */
/* Offsets for data table __svml_dcbrt_data_internal_avx512.

   Each value is a byte offset from the table label.  Every entry is
   one 64-byte vector (8 quadwords) except cbrt_tbl_H, which occupies
   two (16 quadwords), so the offsets advance by 64 with a single step
   of 128 after cbrt_tbl_H.  The order must match the order in which
   the .quad groups are emitted in the table.  */
#define etbl_H				0
#define etbl_L				64
#define cbrt_tbl_H			128
#define BiasL				256
#define SZero				320
#define OneThird			384
#define Bias3				448
#define Three				512
#define One				576
#define poly_coeff10			640
#define poly_coeff9			704
#define poly_coeff8			768
#define poly_coeff7			832
#define poly_coeff6			896
#define poly_coeff5			960
#define poly_coeff4			1024
#define poly_coeff3			1088
#define poly_coeff2			1152
#define poly_coeff1			1216
/* _ZGVeN8v_cbrt_skx -- cbrt on eight packed doubles, AVX-512, branch-free.

   In:    %zmm0 = x (eight doubles)
   Out:   %zmm0 = cbrt(x), lane by lane
   Clobb: %zmm1-%zmm15.  No general-purpose register, stack slot or
	  call is used.

   Every constant is loaded RIP-relative from
   __svml_dcbrt_data_internal_avx512; an operand of the form
   NAME+__svml_dcbrt_data_internal_avx512(%rip) adds the byte offset
   NAME to the table label.

   Naming used in the comments below:
     R         = DblRcp * Mantissa - 1          (reduced argument)
     k         = floor(exponent / 3)
     j         = exponent - 3*k                 (0, 1 or 2)
     Sh, Sl    = etbl_H[j], etbl_L[j]           (high/low part of 2^(j/3))
     Th        = cbrt_tbl_H[index of DblRcp]
     scaled_Th = Th * 2^k
   Result = sign(x) | scaled_Th * (Sh + Sl + Sh*R*Poly(R)).

   NOTE(review): ENTRY/END are not defined in the visible part of this
   file; presumably they are provided by <sysdep.h> -- confirm.  */
	.section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN8v_cbrt_skx)
	/* Mantissa of x (imm8 = 0).  */
	vgetmantpd $0, {sae}, %zmm0, %zmm14

	/* GetExp(x) */
	vgetexppd {sae}, %zmm0, %zmm7
	vmovups	BiasL+__svml_dcbrt_data_internal_avx512(%rip), %zmm8

	/* exponent/3 */
	vmovups	OneThird+__svml_dcbrt_data_internal_avx512(%rip), %zmm9
	vmovups	Bias3+__svml_dcbrt_data_internal_avx512(%rip), %zmm10

	/* Reduced argument: R = DblRcp*Mantissa - 1 */
	vmovups	One+__svml_dcbrt_data_internal_avx512(%rip), %zmm2

	/* exponent%3 (to be used as index) */
	vmovups	Three+__svml_dcbrt_data_internal_avx512(%rip), %zmm11

	/* DblRcp ~ 1/Mantissa */
	vrcp14pd %zmm14, %zmm13

	/* zmm12 = exponent + BiasL: the bias places the integer exponent
	   in the low bits of the double's bit pattern.  */
	vaddpd	{rn-sae}, %zmm8, %zmm7, %zmm12

	/* zmm6 = sign bit of x, saved before %zmm0 is reused below.  */
	vandpd	SZero+__svml_dcbrt_data_internal_avx512(%rip), %zmm0, %zmm6

	/* round DblRcp to 3 fractional bits (RN mode, no Precision exception) */
	vrndscalepd $72, {sae}, %zmm13, %zmm15

	/* zmm10 = (exponent + BiasL) * OneThird - Bias3 ~ exponent/3 */
	vfmsub231pd {rn-sae}, %zmm12, %zmm9, %zmm10

	/* polynomial coefficients (%zmm0 and %zmm7 are reused here) */
	vmovups	poly_coeff10+__svml_dcbrt_data_internal_avx512(%rip), %zmm0
	vmovups	poly_coeff8+__svml_dcbrt_data_internal_avx512(%rip), %zmm7
	vmovups	poly_coeff7+__svml_dcbrt_data_internal_avx512(%rip), %zmm9

	/* zmm2 = R = DblRcp*Mantissa - One */
	vfmsub231pd {rn-sae}, %zmm15, %zmm14, %zmm2

	/* zmm5 = k: exponent/3 rounded down, no Precision exception */
	vrndscalepd $9, {sae}, %zmm10, %zmm5

	/* Table lookup: first half of cbrt_tbl_H goes in a register */
	vmovups	cbrt_tbl_H+__svml_dcbrt_data_internal_avx512(%rip), %zmm10
	vmovups	poly_coeff6+__svml_dcbrt_data_internal_avx512(%rip), %zmm8
	vmovups	poly_coeff3+__svml_dcbrt_data_internal_avx512(%rip), %zmm13

	/* zmm9 = poly_coeff7 + R*poly_coeff8 */
	vfmadd231pd {rn-sae}, %zmm2, %zmm7, %zmm9

	/* zmm12 = (exponent + BiasL) - 3*k: low bits now hold j */
	vfnmadd231pd {rn-sae}, %zmm5, %zmm11, %zmm12
	vmovups	poly_coeff5+__svml_dcbrt_data_internal_avx512(%rip), %zmm11
	vmovups	poly_coeff1+__svml_dcbrt_data_internal_avx512(%rip), %zmm14

	/* Prepare table index */
	vpsrlq	$49, %zmm15, %zmm1

	/* Table lookup: 2^(exponent%3) */
	/* zmm4 = Sh, read from offset 0 of the table */
	vpermpd	__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm4
	/* zmm3 = Sl */
	vpermpd	etbl_L+__svml_dcbrt_data_internal_avx512(%rip), %zmm12, %zmm3
	/* zmm10 = Th, selected from all 16 cbrt_tbl_H entries */
	vpermt2pd cbrt_tbl_H+64+__svml_dcbrt_data_internal_avx512(%rip), %zmm1, %zmm10
	vmovups	poly_coeff9+__svml_dcbrt_data_internal_avx512(%rip), %zmm1

	/* zmm11 = poly_coeff5 + R*poly_coeff6 */
	vfmadd231pd {rn-sae}, %zmm2, %zmm8, %zmm11
	vmovups	poly_coeff2+__svml_dcbrt_data_internal_avx512(%rip), %zmm12

	/* zmm15 = scaled_Th = Th * 2^k */
	vscalefpd {rn-sae}, %zmm5, %zmm10, %zmm15

	/* zmm1 = poly_coeff9 + R*poly_coeff10 */
	vfmadd231pd {rn-sae}, %zmm2, %zmm0, %zmm1
	vmovups	poly_coeff4+__svml_dcbrt_data_internal_avx512(%rip), %zmm5

	/* zmm14 = poly_coeff1 + R*poly_coeff2 */
	vfmadd231pd {rn-sae}, %zmm2, %zmm12, %zmm14

	/* zmm0 = R^2 */
	vmulpd	{rn-sae}, %zmm2, %zmm2, %zmm0

	/* zmm13 = poly_coeff3 + R*poly_coeff4 */
	vfmadd231pd {rn-sae}, %zmm2, %zmm5, %zmm13

	/* Sh*R */
	vmulpd	{rn-sae}, %zmm2, %zmm4, %zmm2

	/* Combine the five coefficient pairs by Horner steps in R^2;
	   zmm1 ends up holding Poly.  */
	vfmadd213pd {rn-sae}, %zmm9, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm1
	vfmadd213pd {rn-sae}, %zmm14, %zmm0, %zmm1

	/* Sl + (Sh*R)*Poly */
	vfmadd213pd {rn-sae}, %zmm3, %zmm1, %zmm2

	/*
	 * branch-free
	 * scaled_Th*(Sh+Sl+Sh*R*Poly)
	 */
	vaddpd	{rn-sae}, %zmm4, %zmm2, %zmm3
	vmulpd	{rn-sae}, %zmm15, %zmm3, %zmm4

	/* Put the sign of x back on the result.  */
	vorpd	%zmm6, %zmm4, %zmm0

	/* Return to the caller; without this, execution would run past
	   the end of the function.  */
	ret

END(_ZGVeN8v_cbrt_skx)
/* Read-only constant table for the AVX-512 cbrt kernel.

   The table is addressed as LABEL + byte offset, so the order and size
   of the groups below must not change: every group is 8 quadwords
   (64 bytes) except cbrt_tbl_H, which is 16 quadwords (128 bytes).
   The start of the table and of each group is aligned to 64 bytes,
   matching the align(64) members of the layout description.  */
	.section .rodata, "a"
	.align	64

/* C-style description of the layout.  It is only seen by the
   preprocessor when __svml_dcbrt_data_internal_avx512_typedef is
   defined; otherwise it is skipped entirely.  */
#ifdef __svml_dcbrt_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
	__declspec(align(64)) VUINT32 etbl_H[8][2];
	__declspec(align(64)) VUINT32 etbl_L[8][2];
	__declspec(align(64)) VUINT32 cbrt_tbl_H[16][2];
	__declspec(align(64)) VUINT32 BiasL[8][2];
	__declspec(align(64)) VUINT32 SZero[8][2];
	__declspec(align(64)) VUINT32 OneThird[8][2];
	__declspec(align(64)) VUINT32 Bias3[8][2];
	__declspec(align(64)) VUINT32 Three[8][2];
	__declspec(align(64)) VUINT32 One[8][2];
	__declspec(align(64)) VUINT32 poly_coeff10[8][2];
	__declspec(align(64)) VUINT32 poly_coeff9[8][2];
	__declspec(align(64)) VUINT32 poly_coeff8[8][2];
	__declspec(align(64)) VUINT32 poly_coeff7[8][2];
	__declspec(align(64)) VUINT32 poly_coeff6[8][2];
	__declspec(align(64)) VUINT32 poly_coeff5[8][2];
	__declspec(align(64)) VUINT32 poly_coeff4[8][2];
	__declspec(align(64)) VUINT32 poly_coeff3[8][2];
	__declspec(align(64)) VUINT32 poly_coeff2[8][2];
	__declspec(align(64)) VUINT32 poly_coeff1[8][2];
} __svml_dcbrt_data_internal_avx512;
#endif
__svml_dcbrt_data_internal_avx512:
	/* etbl_H: high parts; entries 0-2 are 1.0, 2^(1/3), 2^(2/3),
	   entries 4-6 are the same values negated */
	.quad	0x3ff0000000000000
	.quad	0x3ff428a2f98d728b
	.quad	0x3ff965fea53d6e3d
	.quad	0x0000000000000000
	.quad	0xbff0000000000000
	.quad	0xbff428a2f98d728b
	.quad	0xbff965fea53d6e3d
	.quad	0x0000000000000000
	/* etbl_L: low-order parts paired with etbl_H */
	.align	64
	.quad	0x0000000000000000
	.quad	0xbc7ddc22548ea41e
	.quad	0xbc9f53e999952f09
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x3c7ddc22548ea41e
	.quad	0x3c9f53e999952f09
	.quad	0x0000000000000000
	/* cbrt_tbl_H: 9 used entries followed by zero padding.
	   NOTE(review): entry i looks like cbrt(16/(8+i)) -- spot-checked
	   for i = 0, 4, 8 only; confirm.  */
	.align	64
	.quad	0x3ff428a2f98d728b
	.quad	0x3ff361f35ca116ff
	.quad	0x3ff2b6b5edf6b54a
	.quad	0x3ff220e6dd675180
	.quad	0x3ff19c3b38e975a8
	.quad	0x3ff12589c21fb842
	.quad	0x3ff0ba6ee5f9aad4
	.quad	0x3ff059123d3a9848
	.quad	0x3ff0000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	.quad	0x0000000000000000
	/* BiasL: 1.5 * 2^52 */
	.align	64
	.quad	0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000, 0x4338000000000000
	/* SZero: sign-bit mask */
	.align	64
	.quad	0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
	/* OneThird */
	.align	64
	.quad	0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556, 0x3fd5555555555556
	/* Bias3: 2^51, i.e. BiasL / 3 */
	.align	64
	.quad	0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000, 0x4320000000000000
	/* Three: 3.0 */
	.align	64
	.quad	0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000, 0x4008000000000000
	/* One: 1.0 */
	.align	64
	.quad	0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
	/* poly_coeff10 */
	.align	64
	.quad	0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62, 0xbf882e3b6adeca62
	/* poly_coeff9 */
	.align	64
	.quad	0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875, 0x3f8bda24bae48875
	/* poly_coeff8 */
	.align	64
	.quad	0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f, 0xbf9036b87c71d55f
	/* poly_coeff7 */
	.align	64
	.quad	0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914, 0x3f9374ed9398b914
	/* poly_coeff6 */
	.align	64
	.quad	0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e, 0xbf98090d77f2468e
	/* poly_coeff5 */
	.align	64
	.quad	0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569, 0x3f9ee71141dcf569
	/* poly_coeff4 */
	.align	64
	.quad	0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e, 0xbfa511e8d2b0363e
	/* poly_coeff3 */
	.align	64
	.quad	0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31, 0x3faf9add3c0b7e31
	/* poly_coeff2 */
	.align	64
	.quad	0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741, 0xbfbc71c71c71c741
	/* poly_coeff1 */
	.align	64
	.quad	0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557, 0x3fd5555555555557
	.align	64
	.type	__svml_dcbrt_data_internal_avx512, @object
	.size	__svml_dcbrt_data_internal_avx512, .-__svml_dcbrt_data_internal_avx512