/* Function exp10f vectorized with AVX-512.
   Copyright (C) 2021-2022 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   https://www.gnu.org/licenses/.  */
/*
 * ALGORITHM DESCRIPTION:
 *   Typical exp10() implementation, except that:
 *    - tables are small (16 elements), allowing for fast gathers
 *    - all arguments processed in the main path
 *    - final VSCALEF assists branch-free design (correct overflow/underflow and special case responses)
 *    - a VAND is used to ensure the reduced argument |R|<2, even for large inputs
 *    - RZ mode used to avoid overflow to +/-Inf for x*log2(10); helps with special case handling
 *    - SAE used to avoid spurious flag settings
 */
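/* A rough scalar sketch of the main path below, for orientation only.  The
 * helper name (exp10f_sketch), the decimal constants and the on-the-fly table
 * generation are illustrative assumptions; the vector code uses the exact bit
 * patterns stored in __svml_sexp10_data_internal_avx512 at the end of this
 * file.
 *
 *   #include <math.h>
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static float exp10f_sketch (float x)
 *   {
 *     const float l2e     = 3.321928095f;    // log2(10)
 *     const float shifter = 0x1.8p13f;       // 2^(23-10)*1.5
 *     const float l2h     = 3.0103001e-1f;   // log10(2), high part
 *     const float l2l     = -1.432e-8f;      // log10(2), low part
 *     float s  = shifter + x * l2e;          // the asm uses an FMA in RZ mode
 *     float z0 = s - shifter;                // ~x*log2(10), 10 fractional bits
 *     uint32_t j;
 *     memcpy (&j, &s, sizeof j);             // low 10 mantissa bits = table index
 *     float r  = (x - z0 * l2h) - z0 * l2l;  // reduced argument
 *     float th = exp2f ((float) ((j >> 5) & 31) / 32.0f);  // Exp_tbl_H entry
 *     float tl = exp2f ((float) (j & 31) / 1024.0f);       // Exp_tbl_L entry
 *     float p  = r * (2.302585f + 2.65185f * r);           // ~10^r - 1
 *     float t  = th * tl;                                  // 2^(fraction of z0)
 *     return ldexpf (t + t * p, (int) floorf (z0));        // the VSCALEF step
 *   }
 */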
/* Offsets for data table __svml_sexp10_data_internal_avx512
 */
#define poly_coeff2                     704
#define poly_coeff1                     768
        .section .text.evex512, "ax", @progbits
ENTRY(_ZGVeN16v_exp10f_skx)
        cfi_def_cfa_offset(16)
        vmovups L2E+__svml_sexp10_data_internal_avx512(%rip), %zmm2
        vmovups Shifter+__svml_sexp10_data_internal_avx512(%rip), %zmm1
        vmovups L2H+__svml_sexp10_data_internal_avx512(%rip), %zmm5
        vmovups L2L+__svml_sexp10_data_internal_avx512(%rip), %zmm4

        /* ensure |R|<2 even for special cases */
        vmovups EMask+__svml_sexp10_data_internal_avx512(%rip), %zmm6
        vmovups poly_coeff2+__svml_sexp10_data_internal_avx512(%rip), %zmm9
        /* 2^(23-10)*1.5 + x * log2(10) */
        vfmadd213ps {rz-sae}, %zmm1, %zmm0, %zmm2
        vmovups poly_coeff1+__svml_sexp10_data_internal_avx512(%rip), %zmm10
        vmovups __svml_sexp10_data_internal_avx512(%rip), %zmm8
        vmovups Exp_tbl_H+__svml_sexp10_data_internal_avx512(%rip), %zmm15
        vmovups Threshold+__svml_sexp10_data_internal_avx512(%rip), %zmm13
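        /* upper 5 table-index bits, used below to select the Exp_tbl_H entry */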
        vpsrld  $5, %zmm2, %zmm3
        /* Z0 ~ x*log2(10), rounded down to 10 fractional bits */
        vsubps  {rn-sae}, %zmm1, %zmm2, %zmm1
        vpermt2ps Exp_tbl_L+64+__svml_sexp10_data_internal_avx512(%rip), %zmm2, %zmm8
        vpermt2ps Exp_tbl_H+64+__svml_sexp10_data_internal_avx512(%rip), %zmm3, %zmm15
        vandps  AbsMask+__svml_sexp10_data_internal_avx512(%rip), %zmm0, %zmm12

        /* R = x - Z0*log10(2) */
        vfnmadd213ps {rn-sae}, %zmm0, %zmm1, %zmm5
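        /* k0 = lanes with |x| >= Threshold (candidates for the special-value path) */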
        vcmpps  $29, {sae}, %zmm13, %zmm12, %k0
        vfnmadd231ps {rn-sae}, %zmm1, %zmm4, %zmm5
        vrangeps $2, {sae}, %zmm6, %zmm5, %zmm11
        vfmadd231ps {rn-sae}, %zmm11, %zmm9, %zmm10
        vmulps  {rn-sae}, %zmm11, %zmm10, %zmm14
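        /* k1 = (x != 0); the masked multiply below keeps Th unchanged on x == 0 lanes */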
        vpxord  %zmm7, %zmm7, %zmm7
        vcmpps  $4, {sae}, %zmm7, %zmm0, %k1
        vmulps  {rn-sae}, %zmm8, %zmm15, %zmm15{%k1}
        vfmadd213ps {rn-sae}, %zmm15, %zmm14, %zmm15
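        /* scale by 2^floor(Z0); VSCALEF gives correct overflow/underflow and special-case results */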
        vscalefps {rn-sae}, %zmm1, %zmm15, %zmm1

        /* Go to special inputs processing branch */
        jne     L(SPECIAL_VALUES_BRANCH)
        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm1
        /* Restore registers
         * and exit the function
         */
L(SPECIAL_VALUES_BRANCH):
        vmovups %zmm0, 64(%rsp)
        vmovups %zmm1, 128(%rsp)
        # LOE rbx r12 r13 r14 r15 edx zmm1

        # LOE rbx r12 r13 r14 r15 eax edx
        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r15 r12d r13d
        /* Call scalar math function */
        jc      L(SCALAR_MATH_CALL)
        # LOE rbx r15 r12d r13d

L(SPECIAL_VALUES_LOOP):

        /* Check bits in range mask */
        jl      L(RANGEMASK_CHECK)
        # LOE rbx r15 r12d r13d
        vmovups 128(%rsp), %zmm1

        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
        # LOE rbx r12 r13 r14 r15 zmm1
        /* Scalar math function call
         * to process special input
         */
        vmovss  64(%rsp, %r14, 4), %xmm0
        call    exp10f@PLT
        # LOE rbx r14 r15 r12d r13d xmm0

        vmovss  %xmm0, 128(%rsp, %r14, 4)

        /* Process special inputs in loop */
        jmp     L(SPECIAL_VALUES_LOOP)
        # LOE rbx r15 r12d r13d
END(_ZGVeN16v_exp10f_skx)

        .section .rodata, "a"
#ifdef __svml_sexp10_data_internal_avx512_typedef
typedef unsigned int VUINT32;
typedef struct {
        __declspec(align(64)) VUINT32 Exp_tbl_L[32][1];
        __declspec(align(64)) VUINT32 Exp_tbl_H[32][1];
        __declspec(align(64)) VUINT32 L2E[16][1];
        __declspec(align(64)) VUINT32 Shifter[16][1];
        __declspec(align(64)) VUINT32 L2H[16][1];
        __declspec(align(64)) VUINT32 L2L[16][1];
        __declspec(align(64)) VUINT32 EMask[16][1];
        __declspec(align(64)) VUINT32 AbsMask[16][1];
        __declspec(align(64)) VUINT32 Threshold[16][1];
        __declspec(align(64)) VUINT32 poly_coeff2[16][1];
        __declspec(align(64)) VUINT32 poly_coeff1[16][1];
} __svml_sexp10_data_internal_avx512;
#endif
__svml_sexp10_data_internal_avx512:
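        /* Exp_tbl_L: ~2^(j/1024), j = 0..31 */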
        .long 0x3f800001, 0x3f801631, 0x3f802c65, 0x3f80429d
        .long 0x3f8058d9, 0x3f806f18, 0x3f80855c, 0x3f809ba3
        .long 0x3f80b1ee, 0x3f80c83d, 0x3f80de90, 0x3f80f4e7
        .long 0x3f810b42, 0x3f8121a0, 0x3f813803, 0x3f814e69
        .long 0x3f8164d3, 0x3f817b41, 0x3f8191b3, 0x3f81a829
        .long 0x3f81bea2, 0x3f81d520, 0x3f81eba2, 0x3f820227
        .long 0x3f8218b0, 0x3f822f3d, 0x3f8245cf, 0x3f825c64
        .long 0x3f8272fd, 0x3f828999, 0x3f82a03a, 0x3f82b6df
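        /* Exp_tbl_H: 2^(j/32), j = 0..31 */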
        .long 0x3f800000, 0x3f82cd87, 0x3f85aac3, 0x3f88980f
        .long 0x3f8b95c2, 0x3f8ea43a, 0x3f91c3d3, 0x3f94f4f0
        .long 0x3f9837f0, 0x3f9b8d3a, 0x3f9ef532, 0x3fa27043
        .long 0x3fa5fed7, 0x3fa9a15b, 0x3fad583f, 0x3fb123f6
        .long 0x3fb504f3, 0x3fb8fbaf, 0x3fbd08a4, 0x3fc12c4d
        .long 0x3fc5672a, 0x3fc9b9be, 0x3fce248c, 0x3fd2a81e
        .long 0x3fd744fd, 0x3fdbfbb8, 0x3fe0ccdf, 0x3fe5b907
        .long 0x3feac0c7, 0x3fefe4ba, 0x3ff5257d, 0x3ffa83b3
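        /* L2E = log2(10) */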
        .long 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78, 0x40549A78
        /* Shifter=2^(23-10)*1.5 */
        .long 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000, 0x46400000
        /* L2H = log10(2)_high */
        .long 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b, 0x3e9a209b
        /* L2L = log10(2)_low */
        .long 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860, 0xb2760860
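        /* EMask */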
        .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000
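        /* AbsMask */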
        .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff
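        /* Threshold */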
        .long 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818, 0x4217B818
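        /* poly_coeff2 */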
        .long 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA, 0x4029B7DA
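        /* poly_coeff1 */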
        .long 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D, 0x40135D8D
        .type __svml_sexp10_data_internal_avx512, @object
        .size __svml_sexp10_data_internal_avx512, .-__svml_sexp10_data_internal_avx512