1 /* Function log2f vectorized with SSE4.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Get short reciprocal approximation Rcp ~ 1/mantissa(x)
24 * log2(x) = k - log2(Rcp) + poly_approximation(R)
25 * log2(Rcp) is tabulated
30 /* Offsets for data table __svml_slog2_data_internal
 * (byte offsets of each field from the start of the table; 48 for
 * iOffExpoMask matches its position as the fourth 16-byte field in the
 * layout struct near the end of this file.
 * NOTE(review): the defines for the other fields referenced by the code
 * (MinNorm, MaxNorm, iBrkValue, One, sPoly) are presumably on lines not
 * shown in this listing -- confirm against the full file.)
35 #define iOffExpoMask 48
41 .section .text.sse4, "ax", @progbits
/* _ZGVbN4v_log2f_sse4: entry point of the SSE4 vectorized log2f (see the
   file header).  Every constant row of __svml_slog2_data_internal holds
   four 32-bit values, one per vector lane.
   NOTE(review): this listing shows only part of the instruction stream.
   The comments added below describe the visible instructions only; the
   stack adjustment, the multiplies of the polynomial evaluation, the mask
   extraction and the return are presumably on lines not shown -- confirm
   against the full file.  */
42 ENTRY(_ZGVbN4v_log2f_sse4)
/* Unwind info: CFA offset is 80 from here (frame setup not shown).  */
44 cfi_def_cfa_offset(80)
47 /* reduction: compute r, n */
/* xmm2 = iBrkValue (2/3 per the data table comment).  */
48 movdqu iBrkValue+__svml_slog2_data_internal(%rip), %xmm2
/* xmm10 = iOffExpoMask (significand mask per the data table comment).  */
50 movdqu iOffExpoMask+__svml_slog2_data_internal(%rip), %xmm10
/* Start four independent accumulators from sPoly rows 0, 2, 4 and 6
   (16 bytes per row), i.e. coeff9, coeff7, coeff5 and coeff3.  */
56 movups sPoly+__svml_slog2_data_internal(%rip), %xmm5
57 movups sPoly+32+__svml_slog2_data_internal(%rip), %xmm6
58 movups sPoly+64+__svml_slog2_data_internal(%rip), %xmm7
59 movups sPoly+96+__svml_slog2_data_internal(%rip), %xmm9
/* Per-lane masks: xmm4 = (xmm4 < MinNorm), xmm3 = !(xmm3 <= MaxNorm);
   the not-less-or-equal predicate is also true for unordered (NaN) lanes.
   NOTE(review): xmm3 and xmm4 are presumably copies of the input set up
   on lines not shown.  */
60 cmpltps MinNorm+__svml_slog2_data_internal(%rip), %xmm4
61 cmpnleps MaxNorm+__svml_slog2_data_internal(%rip), %xmm3
/* xmm10 -= 1.0 in every lane.  */
63 subps One+__svml_slog2_data_internal(%rip), %xmm10
/* Add sPoly rows 1, 3, 5 and 7 (coeff8, coeff6, coeff4, coeff2) to the
   accumulators.  NOTE(review): the multiplies that presumably sit between
   the loads above and these adds are not visible here.  */
68 addps sPoly+16+__svml_slog2_data_internal(%rip), %xmm5
70 addps sPoly+48+__svml_slog2_data_internal(%rip), %xmm6
73 addps sPoly+80+__svml_slog2_data_internal(%rip), %xmm7
74 addps sPoly+112+__svml_slog2_data_internal(%rip), %xmm9
79 /* combine and get argument value range mask */
/* Add sPoly row 8 (coeff1) to xmm9.  */
85 addps sPoly+128+__svml_slog2_data_internal(%rip), %xmm9
90 /* Go to special inputs processing branch */
/* Taken when ZF is clear.  NOTE(review): the flag-setting instruction is
   not shown; presumably it tests the range mask in edx, which the LOE
   line below lists as live.  */
91 jne L(SPECIAL_VALUES_BRANCH)
92 # LOE rbx rbp r12 r13 r14 r15 edx xmm0 xmm1
95 * and exit the function
/* Unwind info: CFA offset 8 for the return path, then back to 80 for the
   code that follows.  */
101 cfi_def_cfa_offset(8)
103 cfi_def_cfa_offset(80)
109 L(SPECIAL_VALUES_BRANCH):
/* Spill xmm0 to 32(%rsp) and xmm1 to 48(%rsp).  The scalar fallback below
   reads one element from the 32(%rsp) buffer and stores one element into
   the 48(%rsp) buffer, which is later reloaded into xmm1.  */
110 movups %xmm0, 32(%rsp)
111 movups %xmm1, 48(%rsp)
112 # LOE rbx rbp r12 r13 r14 r15 edx
123 # LOE rbx rbp r15 r12d r13d
132 /* Call scalar math function */
/* Taken when CF is set.  NOTE(review): the instruction producing CF is not
   shown; presumably a bit test of the range mask for the current lane.  */
133 jc L(SCALAR_MATH_CALL)
134 # LOE rbx rbp r15 r12d r13d
140 L(SPECIAL_VALUES_LOOP):
144 /* Check bits in range mask */
/* NOTE(review): the compare feeding this branch and the
   L(RANGEMASK_CHECK) label itself are not visible in this listing.  */
145 jl L(RANGEMASK_CHECK)
146 # LOE rbx rbp r15 r12d r13d
/* Reload the vector at 48(%rsp) (the buffer patched by the scalar
   fallback) into xmm1.  */
154 movups 48(%rsp), %xmm1
161 # LOE rbx rbp r12 r13 r14 r15 xmm1
163 /* Scalar math function call
164 * to process special input
169 movss 32(%rsp, %r14, 4), %xmm0
171 # LOE rbx rbp r14 r15 r12d r13d xmm0
173 movss %xmm0, 48(%rsp, %r14, 4)
175 /* Process special inputs in loop */
176 jmp L(SPECIAL_VALUES_LOOP)
177 # LOE rbx rbp r15 r12d r13d
178 END(_ZGVbN4v_log2f_sse4)
/* Read-only constant table referenced by _ZGVbN4v_log2f_sse4 through
   RIP-relative "field + __svml_slog2_data_internal" operands.  Each field
   is one 16-byte row (the same 32-bit value repeated for four lanes);
   sPoly is nine such rows.  */
180 .section .rodata, "a"
/* Layout description of the table.  It is only seen by the preprocessor
   when __svml_slog2_data_internal_typedef is defined.
   NOTE(review): the opening "typedef struct" line and the matching #endif
   are not visible in this listing.  */
183 #ifdef __svml_slog2_data_internal_typedef
184 typedef unsigned int VUINT32;
186 __declspec(align(16)) VUINT32 MinNorm[4][1];
187 __declspec(align(16)) VUINT32 MaxNorm[4][1];
188 __declspec(align(16)) VUINT32 iBrkValue[4][1];
189 __declspec(align(16)) VUINT32 iOffExpoMask[4][1];
190 __declspec(align(16)) VUINT32 One[4][1];
191 __declspec(align(16)) VUINT32 sPoly[9][4][1];
192 } __svml_slog2_data_internal;
194 __svml_slog2_data_internal:
/* MinNorm (first field in the layout above): 0x00800000 is the smallest
   positive normal single-precision value, 2^-126.  */
196 .long 0x00800000, 0x00800000, 0x00800000, 0x00800000
/* MaxNorm (second field): 0x7f7fffff is the largest finite
   single-precision value.  */
199 .long 0x7f7fffff, 0x7f7fffff, 0x7f7fffff, 0x7f7fffff
200 /* iBrkValue = SP 2/3 */
202 .long 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab, 0x3f2aaaab
203 /* iOffExpoMask = SP significand mask */
205 .long 0x007fffff, 0x007fffff, 0x007fffff, 0x007fffff
/* One (fifth field): 0x3f800000 is 1.0f.  */
208 .long 0x3f800000, 0x3f800000, 0x3f800000, 0x3f800000
/* sPoly[9]: polynomial coefficients, stored from coeff9 (row 0) down to
   coeff1 (row 8); coeff1 = 0x3fB8AA3B is approximately 1.442695,
   i.e. log2(e).  */
211 .long 0x3e554012, 0x3e554012, 0x3e554012, 0x3e554012 /* coeff9 */
212 .long 0xbe638E14, 0xbe638E14, 0xbe638E14, 0xbe638E14 /* coeff8 */
213 .long 0x3e4D660B, 0x3e4D660B, 0x3e4D660B, 0x3e4D660B /* coeff7 */
214 .long 0xbe727824, 0xbe727824, 0xbe727824, 0xbe727824 /* coeff6 */
215 .long 0x3e93DD07, 0x3e93DD07, 0x3e93DD07, 0x3e93DD07 /* coeff5 */
216 .long 0xbeB8B969, 0xbeB8B969, 0xbeB8B969, 0xbeB8B969 /* coeff4 */
217 .long 0x3eF637C0, 0x3eF637C0, 0x3eF637C0, 0x3eF637C0 /* coeff3 */
218 .long 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B, 0xbf38AA2B /* coeff2 */
219 .long 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B, 0x3fB8AA3B /* coeff1 */
/* ELF symbol metadata: mark the table as a data object and record its
   size as the distance from the label to this point.  */
221 .type __svml_slog2_data_internal, @object
222 .size __svml_slog2_data_internal, .-__svml_slog2_data_internal