1 /* Function acos vectorized with AVX-512.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * SelMask = (|x| >= 0.5) ? 1 : 0;
23 * R = SelMask ? sqrt(0.5 - 0.5*|x|) : |x|
24 * acos(|x|) = SelMask ? 2*Poly(R) : (Pi/2 - Poly(R))
25 * acos(x) = sign(x) ? (Pi - acos(|x|)) : acos(|x|)
29 /* Offsets for data table __svml_dacos_data_internal
36 #define sqrt_coeff_1 320
37 #define sqrt_coeff_2 384
38 #define sqrt_coeff_3 448
39 #define sqrt_coeff_4 512
40 #define poly_coeff_1 576
41 #define poly_coeff_2 640
42 #define poly_coeff_3 704
43 #define poly_coeff_4 768
44 #define poly_coeff_5 832
45 #define poly_coeff_6 896
46 #define poly_coeff_7 960
47 #define poly_coeff_8 1024
48 #define poly_coeff_9 1088
49 #define poly_coeff_10 1152
50 #define poly_coeff_11 1216
51 #define poly_coeff_12 1280
/* The values above are byte offsets that the code adds to the
   __svml_dacos_data_internal symbol.  Consecutive entries are 64 bytes
   apart, which is the size of one table row (eight 64-bit constants,
   i.e. one full zmm load).
   NOTE(review): the index in a macro name counts in the opposite
   direction from the index in the trailing comment of the table row it
   addresses (for example offset 320, sqrt_coeff_1, is the row commented
   "sqrt_coeff4"); confirm before renaming or reordering anything.  */
57 .section .text.evex512, "ax", @progbits
/* _ZGVeN8v_acos_skx: acos on a 512-bit vector of doubles (zmm registers).

   Register roles on the main path, as far as the code below shows:
     zmm6  input vector x (read only here)
     zmm5  x with the sign bit forced on, i.e. -|x|
     zmm4  sign bit of x
     zmm8  Y = 0.5 - 0.5*|x| (later reused for coefficients)
     zmm3  min(x^2, Y), the polynomial argument; later the signed R
     zmm2  refined sqrt-path value, then the blended R magnitude
     zmm0  result
   Lanes flagged as special are recomputed one at a time through a
   scalar path that works on stack copies at 64(%rsp) (inputs) and
   128(%rsp) (results).

   NOTE(review): the instructions that fill zmm6, write %k3, set up the
   stack frame and produce the flags tested by jne/jc/jl are not part of
   the lines shown here; confirm them against the complete file.  */
58 ENTRY(_ZGVeN8v_acos_skx)
60 cfi_def_cfa_offset(16)
/* Offset 0 of the table is its first row (SgnBit in the layout typedef).  */
66 vmovups __svml_dacos_data_internal(%rip), %zmm7
67 vmovups OneHalf+__svml_dacos_data_internal(%rip), %zmm8
70 vmovups SmallNorm+__svml_dacos_data_internal(%rip), %zmm11
71 vmovups Two+__svml_dacos_data_internal(%rip), %zmm14
72 vmovups sqrt_coeff_1+__svml_dacos_data_internal(%rip), %zmm15
73 vmovups sqrt_coeff_2+__svml_dacos_data_internal(%rip), %zmm2
74 vmovups sqrt_coeff_3+__svml_dacos_data_internal(%rip), %zmm1
75 vmovups MOne+__svml_dacos_data_internal(%rip), %zmm10
/* zmm5 = x | SgnBit = -|x|;  zmm4 = x & SgnBit = sign of x.  */
79 vorpd %zmm6, %zmm7, %zmm5
80 vandpd %zmm6, %zmm7, %zmm4
82 /* Y = 0.5 + 0.5*(-x) */
/* Concretely: zmm8 = OneHalf * zmm5 + OneHalf, with zmm5 = -|x|.  */
83 vfmadd231pd {rn-sae}, %zmm5, %zmm8, %zmm8
/* zmm9 = x^2;  zmm12 = approximate 1/sqrt(Y).  */
86 vmulpd {rn-sae}, %zmm5, %zmm5, %zmm9
87 vrsqrt14pd %zmm8, %zmm12
/* Predicate 17 is LT_OQ: k1 = (Y < SmallNorm), k0 = (-|x| < MOne).  */
88 vcmppd $17, {sae}, %zmm11, %zmm8, %k1
89 vcmppd $17, {sae}, %zmm10, %zmm5, %k0
90 vmovups poly_coeff_5+__svml_dacos_data_internal(%rip), %zmm10
91 vmovups poly_coeff_7+__svml_dacos_data_internal(%rip), %zmm11
/* zmm3 = min(x^2, Y): the argument fed to the polynomial.  */
92 vminpd {sae}, %zmm8, %zmm9, %zmm3
93 vmovups poly_coeff_3+__svml_dacos_data_internal(%rip), %zmm9
/* Zero the rsqrt estimate in the lanes where Y is below SmallNorm.  */
94 vxorpd %zmm12, %zmm12, %zmm12{%k1}
95 vaddpd {rn-sae}, %zmm8, %zmm8, %zmm0
/* Predicate 21 is NLT_UQ: k4 is set where min(x^2, Y) is not less than
   Y, i.e. the lanes that take the sqrt-based value in the blend below.  */
96 vcmppd $21, {sae}, %zmm8, %zmm3, %k4
/* k2 = (x < min(x^2, Y)).  k2 is not read again in the lines shown.  */
99 vcmppd $17, {sae}, %zmm3, %zmm6, %k2
/* With r = zmm12: zmm13 = r^2 and zmm7 = 2Y*r.  */
100 vmulpd {rn-sae}, %zmm12, %zmm12, %zmm13
101 vmulpd {rn-sae}, %zmm12, %zmm0, %zmm7
102 vmovups poly_coeff_4+__svml_dacos_data_internal(%rip), %zmm12
105 vmovups poly_coeff_1+__svml_dacos_data_internal(%rip), %zmm8
/* zmm0 = 2Y*r^2 - Two: the error term driving the sqrt correction
   polynomial that is accumulated in zmm2.  */
106 vfmsub213pd {rn-sae}, %zmm14, %zmm13, %zmm0
107 vmovups sqrt_coeff_4+__svml_dacos_data_internal(%rip), %zmm13
108 vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm12
109 vmovups poly_coeff_11+__svml_dacos_data_internal(%rip), %zmm9
110 vfmadd231pd {rn-sae}, %zmm0, %zmm15, %zmm2
111 vmovups poly_coeff_9+__svml_dacos_data_internal(%rip), %zmm15
112 vmulpd {rn-sae}, %zmm0, %zmm7, %zmm14
113 vfmadd213pd {rn-sae}, %zmm1, %zmm0, %zmm2
114 vmovups poly_coeff_2+__svml_dacos_data_internal(%rip), %zmm1
116 vfmadd213pd {rn-sae}, %zmm13, %zmm0, %zmm2
117 vfmadd231pd {rn-sae}, %zmm3, %zmm8, %zmm1
118 vmovups poly_coeff_10+__svml_dacos_data_internal(%rip), %zmm8
/* zmm0 is reused from here on: square of the polynomial argument.  */
119 vmulpd {rn-sae}, %zmm3, %zmm3, %zmm0
/* zmm2 = zmm7 - zmm14 * zmm2: the corrected sqrt-path value.  */
120 vfnmadd213pd {rn-sae}, %zmm7, %zmm14, %zmm2
121 vmovups poly_coeff_6+__svml_dacos_data_internal(%rip), %zmm7
122 vfmadd231pd {rn-sae}, %zmm3, %zmm15, %zmm8
123 vfmadd213pd {rn-sae}, %zmm12, %zmm0, %zmm1
/* Per lane: keep the sqrt-path value where k4 is set, otherwise -|x|.  */
124 vblendmpd %zmm2, %zmm5, %zmm2{%k4}
125 vfmadd231pd {rn-sae}, %zmm3, %zmm10, %zmm7
126 vmovups poly_coeff_8+__svml_dacos_data_internal(%rip), %zmm10
127 vfmadd231pd {rn-sae}, %zmm3, %zmm11, %zmm10
128 vmovups poly_coeff_12+__svml_dacos_data_internal(%rip), %zmm11
130 vfmadd213pd {rn-sae}, %zmm10, %zmm0, %zmm7
131 vfmadd231pd {rn-sae}, %zmm3, %zmm9, %zmm11
132 vmulpd {rn-sae}, %zmm0, %zmm0, %zmm10
133 vfmadd213pd {rn-sae}, %zmm7, %zmm10, %zmm1
134 vfmadd213pd {rn-sae}, %zmm8, %zmm0, %zmm1
135 vfmadd213pd {rn-sae}, %zmm11, %zmm0, %zmm1
136 vmovups Pi2H+__svml_dacos_data_internal(%rip), %zmm0
137 vmulpd {rn-sae}, %zmm3, %zmm1, %zmm1
/* zmm3 = blended value with the sign bit of x XORed in (signed R).  */
138 vxorpd %zmm4, %zmm2, %zmm3
/* Additive constant: start from the Pi2H row, clear it in the k4 lanes.  */
139 vxorpd %zmm0, %zmm0, %zmm0{%k4}
/* zmm1 = R + R * zmm1.  */
140 vfmadd213pd {rn-sae}, %zmm3, %zmm3, %zmm1
/* OR the PiH row into the constant for the lanes selected by k3.
   NOTE(review): no write to %k3 is visible in these lines; presumably it
   is derived from k2 and k4 by an instruction not shown - confirm.  */
141 vorpd PiH+__svml_dacos_data_internal(%rip), %zmm0, %zmm0{%k3}
/* Result = constant + polynomial term.  */
142 vaddpd {rn-sae}, %zmm1, %zmm0, %zmm0
145 /* Go to special inputs processing branch */
146 jne L(SPECIAL_VALUES_BRANCH)
147 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm6
150 * and exit the function
166 L(SPECIAL_VALUES_BRANCH):
/* Spill the input vector (zmm6) and the main-path result (zmm0) so that
   single lanes can be read and overwritten in memory.  */
167 vmovups %zmm6, 64(%rsp)
168 vmovups %zmm0, 128(%rsp)
169 # LOE rbx r12 r13 r14 r15 edx zmm0
172 # LOE rbx r12 r13 r14 r15 eax edx
176 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
177 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
180 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
181 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
184 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
185 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
186 # LOE rbx r15 r12d r13d
195 /* Call scalar math function */
196 jc L(SCALAR_MATH_CALL)
197 # LOE rbx r15 r12d r13d
203 L(SPECIAL_VALUES_LOOP):
207 /* Check bits in range mask */
208 jl L(RANGEMASK_CHECK)
209 # LOE rbx r15 r12d r13d
/* Reload the result vector from its stack copy (the scalar path below
   stores into that copy lane by lane).  */
217 vmovups 128(%rsp), %zmm0
221 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
222 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
223 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
224 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
225 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
226 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
227 # LOE rbx r12 r13 r14 r15 zmm0
/* Scalar fix-up for one lane: load element r14 of the saved input from
   64(%rsp) into xmm0, then store xmm0 to element r14 of the saved
   result at 128(%rsp).  NOTE(review): the call between the load and the
   store is not among the lines shown; confirm in the complete file.  */
229 /* Scalar math function call
230 * to process special input
235 vmovsd 64(%rsp, %r14, 8), %xmm0
237 # LOE rbx r14 r15 r12d r13d xmm0
239 vmovsd %xmm0, 128(%rsp, %r14, 8)
241 /* Process special inputs in loop */
242 jmp L(SPECIAL_VALUES_LOOP)
243 # LOE rbx r15 r12d r13d
244 END(_ZGVeN8v_acos_skx)
/* Read-only constant table for the vector acos kernel.  Every row is one
   64-bit constant repeated eight times (64 bytes), so a single zmm load
   at a row's byte offset yields that constant in every lane.  */
246 .section .rodata, "a"
/* Layout description only: it is compiled solely when
   __svml_dacos_data_internal_typedef is defined.  Each member is eight
   lanes of two 32-bit words, matching the 64-byte rows below, and the
   members are listed in the same order as the rows.  */
249 #ifdef __svml_dacos_data_internal_typedef
250 typedef unsigned int VUINT32;
252 __declspec(align(64)) VUINT32 SgnBit[8][2];
253 __declspec(align(64)) VUINT32 OneHalf[8][2];
254 __declspec(align(64)) VUINT32 SmallNorm[8][2];
255 __declspec(align(64)) VUINT32 MOne[8][2];
256 __declspec(align(64)) VUINT32 Two[8][2];
257 __declspec(align(64)) VUINT32 sqrt_coeff[4][8][2];
258 __declspec(align(64)) VUINT32 poly_coeff[12][8][2];
259 __declspec(align(64)) VUINT32 PiH[8][2];
260 __declspec(align(64)) VUINT32 Pi2H[8][2];
261 } __svml_dacos_data_internal;
263 __svml_dacos_data_internal:
/* SgnBit: IEEE-754 double sign-bit mask.  */
265 .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000
/* OneHalf: 0.5.  */
268 .quad 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000, 0x3fe0000000000000
/* SmallNorm: 2^-255 (biased exponent 0x300, zero mantissa).  */
271 .quad 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000, 0x3000000000000000
/* MOne: -1.0.  */
274 .quad 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000, 0xbff0000000000000
/* Two: 2.0.  */
277 .quad 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000, 0x4000000000000000
/* sqrt_coeff rows: four rows, commented sqrt_coeff4 down to sqrt_coeff1.  */
280 .quad 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3, 0xbf918000993B24C3 /* sqrt_coeff4 */
281 .quad 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D, 0x3fa400006F70D42D /* sqrt_coeff3 */
282 .quad 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97, 0xbfb7FFFFFFFFFE97 /* sqrt_coeff2 */
283 .quad 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D, 0x3fcFFFFFFFFFFF9D /* sqrt_coeff1 */
/* poly_coeff rows: twelve rows, commented poly_coeff12 down to poly_coeff1.  */
286 .quad 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909, 0x3fa07520C70EB909 /* poly_coeff12 */
287 .quad 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED, 0xbf90FB17F7DBB0ED /* poly_coeff11 */
288 .quad 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE, 0x3f943F44BFBC3BAE /* poly_coeff10 */
289 .quad 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5, 0x3f7A583395D45ED5 /* poly_coeff9 */
290 .quad 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6, 0x3f88F8DC2AFCCAD6 /* poly_coeff8 */
291 .quad 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57, 0x3f8C6DBBCB88BD57 /* poly_coeff7 */
292 .quad 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E, 0x3f91C6DCF538AD2E /* poly_coeff6 */
293 .quad 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd, 0x3f96E89CEBDEFadd /* poly_coeff5 */
294 .quad 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE, 0x3f9F1C72E13AD8BE /* poly_coeff4 */
295 .quad 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8, 0x3fa6DB6DB3B445F8 /* poly_coeff3 */
296 .quad 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE, 0x3fb333333337E0DE /* poly_coeff2 */
297 .quad 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C, 0x3fc555555555529C /* poly_coeff1 */
/* PiH: pi rounded to double.  */
300 .quad 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18, 0x400921fb54442d18
/* Pi2H: pi/2 rounded to double.  */
303 .quad 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18, 0x3ff921fb54442d18
305 .type __svml_dacos_data_internal, @object
306 .size __svml_dacos_data_internal, .-__svml_dacos_data_internal