1 /* Function exp2f vectorized with SSE4.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * exp2(x) = 2^n * T[j] * (1 + P(y))
24 * x = m*(1/K) + y, y in [-1/K..1/K]
25 * m = n*K + j, m, n,j - signed integer, j in [-K/2..K/2]
27 * values of 2^(j/K) are tabulated as T[j]
29 * P(y) is a minimax polynomial approximation of exp2(y)-1
30 * on small interval [-1/K..1/K]
37 * exp2(x) = 1 for subnormals
39 * if x >= 128.0 then exp2f(x) overflows
40 * if x < -151.0 then exp2f(x) underflows
44 /* Offsets for data table __svml_sexp2_data_internal
55 #define _iDomainRange 144
59 .section .text.sse4, "ax", @progbits
/* _ZGVbN4v_exp2f_sse4 - the SSE4 vector exp2f entry point named in the file
   header.  The visible code reads 16-byte rows of
   __svml_sexp2_data_internal and can branch to the special-input code at
   L(SPECIAL_VALUES_BRANCH).
   NOTE(review): the original line numbers embedded at the start of each
   line skip values, so instructions between the visible ones are presumably
   missing from this view.  Confirm every comment below against the complete
   file.  */
60 ENTRY(_ZGVbN4v_exp2f_sse4)
/* Unwind info: from here on the CFA is at offset 80 from the stack pointer.
   The stack adjustment that creates this frame is not visible in this
   view.  */
62 cfi_def_cfa_offset(80)
64 /* Load the table row at offset 0 (the _sShifter row) into xmm1 */
65 movups __svml_sexp2_data_internal(%rip), %xmm1
/* xmm4 = _sPC6 row.  The addps instructions further down add the _sPC5,
   _sPC4, _sPC3, _sPC2 and _sPC1 rows into xmm4 in that order.  */
71 movups _sPC6+__svml_sexp2_data_internal(%rip), %xmm4
78 /* Check for overflow/underflow: xmm2 is loaded with the _iAbsMask row, and
   pcmpgtd then sets each 32-bit lane of xmm2 to all ones where that lane is
   greater (signed compare) than the _iDomainRange lane, and to zero
   otherwise */
79 movdqu _iAbsMask+__svml_sexp2_data_internal(%rip), %xmm2
85 pcmpgtd _iDomainRange+__svml_sexp2_data_internal(%rip), %xmm2
89 addps _sPC5+__svml_sexp2_data_internal(%rip), %xmm4
91 addps _sPC4+__svml_sexp2_data_internal(%rip), %xmm4
93 addps _sPC3+__svml_sexp2_data_internal(%rip), %xmm4
95 addps _sPC2+__svml_sexp2_data_internal(%rip), %xmm4
97 addps _sPC1+__svml_sexp2_data_internal(%rip), %xmm4
/* The _sPC0 row is added into xmm1, not xmm4.  */
99 addps _sPC0+__svml_sexp2_data_internal(%rip), %xmm1
105 /* Go to special inputs processing branch.  The instruction that sets the
   flags tested by this jne is not visible in this view */
106 jne L(SPECIAL_VALUES_BRANCH)
107 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
110 * and exit the function
/* CFI bookkeeping: the CFA offset is declared as 8 and then as 80 again.
   Presumably the first describes the state once the frame is released for
   the return, and the second re-describes the frame for the code that
   follows.  The stack adjustments and the ret are not visible here.  */
116 cfi_def_cfa_offset(8)
118 cfi_def_cfa_offset(80)
124 L(SPECIAL_VALUES_BRANCH):
/* Spill xmm0 to 32(%rsp) and xmm1 to 48(%rsp).  Presumably xmm0 is the
   input vector and xmm1 the vector result computed above - verify against
   the full file.  */
125 movups %xmm0, 32(%rsp)
126 movups %xmm1, 48(%rsp)
127 # LOE rbx rbp r12 r13 r14 r15 edx
138 # LOE rbx rbp r15 r12d r13d
147 /* Call scalar math function.  Taken when the carry flag is set by a
   preceding instruction that is not visible in this view */
148 jc L(SCALAR_MATH_CALL)
149 # LOE rbx rbp r15 r12d r13d
155 L(SPECIAL_VALUES_LOOP):
159 /* Check bits in range mask.  The compare that feeds this jl is not
   visible in this view */
160 jl L(RANGEMASK_CHECK)
161 # LOE rbx rbp r15 r12d r13d
/* Reload xmm1 from the 48(%rsp) slot, the same area that the scalar path
   below writes one lane at a time.  */
169 movups 48(%rsp), %xmm1
176 # LOE rbx rbp r12 r13 r14 r15 xmm1
/* Scalar fallback for one lane: the 4-byte element with index r14 is read
   from the 32(%rsp) area into xmm0, and xmm0 is later stored to the element
   with the same index in the 48(%rsp) area.  The call instruction itself is
   not visible in this view.  */
178 /* Scalar math function call
179 * to process special input
184 movss 32(%rsp, %r14, 4), %xmm0
186 # LOE rbx rbp r14 r15 r12d r13d xmm0
188 movss %xmm0, 48(%rsp, %r14, 4)
190 /* Process special inputs in loop */
191 jmp L(SPECIAL_VALUES_LOOP)
192 # LOE rbx rbp r15 r12d r13d
193 END(_ZGVbN4v_exp2f_sse4)
195 .section .rodata, "a"
/* C-style description of the constant table layout.  It is only compiled
   when __svml_sexp2_data_internal_typedef is defined.  Every member is an
   array of four VUINT32 (unsigned int) elements declared with 16-byte
   alignment, and the members are listed in the same order as the rows of
   the __svml_sexp2_data_internal table in this file.
   NOTE(review): the struct opener and the closing of the conditional are
   not visible in this view - confirm against the complete file.  */
198 #ifdef __svml_sexp2_data_internal_typedef
199 typedef unsigned int VUINT32;
201 __declspec(align(16)) VUINT32 _sShifter[4][1];
202 __declspec(align(16)) VUINT32 _sPC0[4][1];
203 __declspec(align(16)) VUINT32 _sPC1[4][1];
204 __declspec(align(16)) VUINT32 _sPC2[4][1];
205 __declspec(align(16)) VUINT32 _sPC3[4][1];
206 __declspec(align(16)) VUINT32 _sPC4[4][1];
207 __declspec(align(16)) VUINT32 _sPC5[4][1];
208 __declspec(align(16)) VUINT32 _sPC6[4][1];
209 __declspec(align(16)) VUINT32 _iAbsMask[4][1];
210 __declspec(align(16)) VUINT32 _iDomainRange[4][1];
211 } __svml_sexp2_data_internal;
/* Constant table read by _ZGVbN4v_exp2f_sse4.  Every row is one 32-bit
   pattern repeated four times, so each row is 16 bytes and row k starts at
   byte offset 16*k from the label.  _iDomainRange is row 9, which matches
   the _iDomainRange offset macro value of 144.
   Read as IEEE-754 single precision, _sShifter is 1.5*2^23, _sPC0 is 1.0
   and _sPC1 is approximately 0.693147 (ln 2).  _iAbsMask has every bit set
   except the sign bit.  */
213 __svml_sexp2_data_internal:
214 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
216 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC0 */
218 .long 0x3f317218, 0x3f317218, 0x3f317218, 0x3f317218 /* _sPC1 */
220 .long 0x3e75fdef, 0x3e75fdef, 0x3e75fdef, 0x3e75fdef /* _sPC2 */
222 .long 0x3d6357cf, 0x3d6357cf, 0x3d6357cf, 0x3d6357cf /* _sPC3 */
224 .long 0x3c1d962c, 0x3c1d962c, 0x3c1d962c, 0x3c1d962c /* _sPC4 */
226 .long 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51, 0x3aaf7a51 /* _sPC5 */
228 .long 0x39213c8c, 0x39213c8c, 0x39213c8c, 0x39213c8c /* _sPC6 */
231 .long 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff /* _iAbsMask */
233 .long 0x42fc0000, 0x42fc0000, 0x42fc0000, 0x42fc0000 /* _iDomainRange=126.0 */
/* Mark the label as a data object and record its size as the distance from
   the label to this point.  */
235 .type __svml_sexp2_data_internal, @object
236 .size __svml_sexp2_data_internal, .-__svml_sexp2_data_internal