1 /* Function log10 vectorized with AVX-512.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Get short reciprocal approximation Rcp ~ 1/mantissa(x)
23 * Reduced argument: R = Rcp*mantissa(x) - 1.0
24 * log10(x) = k*log10(2.0) - log10(Rcp) + poly_approximation(R)
25 * log10(Rcp) is tabulated
30 /* Offsets for data table __svml_dlog10_data_internal_avx512
   Each named constant below is the byte offset of one 64-byte
   (8 x double) broadcast row inside the table; consecutive
   coefficients are therefore 64 bytes apart.
   NOTE(review): the offsets for Log_tbl, One and C075 (presumably
   0/128/192, consistent with the 64-byte stride and the 128-byte
   Log_tbl) are not visible in this view -- confirm in the full file. */
35 #define poly_coeff9 256 /* x^9 coefficient row */
36 #define poly_coeff8 320 /* x^8 coefficient row */
37 #define poly_coeff7 384 /* x^7 coefficient row */
38 #define poly_coeff6 448 /* x^6 coefficient row */
39 #define poly_coeff5 512 /* x^5 coefficient row */
40 #define poly_coeff4 576 /* x^4 coefficient row */
41 #define poly_coeff3 640 /* x^3 coefficient row */
42 #define poly_coeff2 704 /* x^2 coefficient row */
43 #define poly_coeff1 768 /* x^1 coefficient row */
/*
 * _ZGVeN8v_log10_skx -- 8-lane (512-bit) vector double log10, AVX-512/SKX.
 *
 * Fast path (visible below): x is split as x = 2^k * Mant with Mant in
 * [1,2); a coarsely rounded reciprocal DblRcp ~ 1/Mant yields the
 * reduced argument R = DblRcp*Mant - 1, and
 *     log10(x) = k*log10(2) + log10(1/DblRcp) + R*poly(R),
 * with log10(1/DblRcp) taken from a 16-entry table.  Lanes flagged as
 * special by vfpclasspd are reworked one at a time through a scalar
 * log10 in the slow path.
 *
 * NOTE(review): this view of the file is missing several instructions of
 * the original routine (stack-frame setup, the reciprocal computation
 * that fills %zmm8, the transfer of %k0 into %edx, the compare feeding
 * each conditional jump, the scalar call itself, and the return
 * sequence).  Comments below describe only what the visible lines
 * establish; anything else is marked as an assumption.
 */
48 .section .text.evex512, "ax", @progbits
49 ENTRY(_ZGVeN8v_log10_skx)
51 cfi_def_cfa_offset(16)
/* Mant = getmant(x): per-lane normalized mantissa.  The input copy is
   read from %zmm7 here -- NOTE(review): the instruction copying the
   argument into %zmm7 is not visible in this view. */
58 vgetmantpd $8, {sae}, %zmm7, %zmm6
/* Preload broadcast constants: 1.0 and polynomial coefficients. */
59 vmovups One+__svml_dlog10_data_internal_avx512(%rip), %zmm3
60 vmovups poly_coeff5+__svml_dlog10_data_internal_avx512(%rip), %zmm12
61 vmovups poly_coeff3+__svml_dlog10_data_internal_avx512(%rip), %zmm13
63 /* Start polynomial evaluation */
64 vmovups poly_coeff9+__svml_dlog10_data_internal_avx512(%rip), %zmm10
65 vmovups poly_coeff8+__svml_dlog10_data_internal_avx512(%rip), %zmm1
66 vmovups poly_coeff7+__svml_dlog10_data_internal_avx512(%rip), %zmm11
67 vmovups poly_coeff6+__svml_dlog10_data_internal_avx512(%rip), %zmm14
69 /* Prepare exponent correction: DblRcp<0.75? */
70 vmovups C075+__svml_dlog10_data_internal_avx512(%rip), %zmm2
/* First 512-bit half of Log_tbl; merged with the second half below by
   vpermt2pd. */
73 vmovups __svml_dlog10_data_internal_avx512(%rip), %zmm5
/* k = unbiased exponent of x, per lane. */
76 vgetexppd {sae}, %zmm7, %zmm0
78 /* DblRcp ~ 1/Mantissa */
/* Flag special lanes into %k0 (imm $94 selects the +/-0, +/-Inf and
   negative classes per the fpclass encoding).
   NOTE(review): the move of %k0 into %edx is not visible here. */
82 vfpclasspd $94, %zmm7, %k0
84 /* round DblRcp to 4 fractional bits (RN mode, no Precision exception) */
/* NOTE(review): %zmm8 is presumably the vrcp14pd reciprocal result --
   the producing instruction is not visible in this view. */
85 vrndscalepd $88, {sae}, %zmm8, %zmm4
86 vmovups poly_coeff4+__svml_dlog10_data_internal_avx512(%rip), %zmm8
89 /* Reduced argument: R = DblRcp*Mantissa - 1 */
90 vfmsub213pd {rn-sae}, %zmm3, %zmm4, %zmm6
/* k1 = (DblRcp < 0.75); imm $17 is the LT_OQ predicate. */
91 vcmppd $17, {sae}, %zmm2, %zmm4, %k1
/* Estrin-style pairs: zmm8 = c4 + R*c5, zmm1 = c8 + R*c9,
   zmm14 = c6 + R*c7, zmm12 = c2 + R*c3. */
92 vfmadd231pd {rn-sae}, %zmm6, %zmm12, %zmm8
93 vmovups poly_coeff2+__svml_dlog10_data_internal_avx512(%rip), %zmm12
94 vfmadd231pd {rn-sae}, %zmm6, %zmm10, %zmm1
95 vfmadd231pd {rn-sae}, %zmm6, %zmm11, %zmm14
96 vmovups poly_coeff1+__svml_dlog10_data_internal_avx512(%rip), %zmm2
/* R2 = R*R */
99 vmulpd {rn-sae}, %zmm6, %zmm6, %zmm15
100 vfmadd231pd {rn-sae}, %zmm6, %zmm13, %zmm12
102 /* Prepare table index */
/* Top bits (exponent + 4 rounded mantissa bits) of DblRcp; vpermt2pd
   below consumes the low index bits to pick one of 16 table entries. */
103 vpsrlq $48, %zmm4, %zmm9
105 /* add 1 to Expon if DblRcp<0.75 */
106 vaddpd {rn-sae}, %zmm3, %zmm0, %zmm0{%k1}
/* R4 = R2*R2 */
107 vmulpd {rn-sae}, %zmm15, %zmm15, %zmm13
/* hi = (c8 + R*c9)*R2 + (c6 + R*c7); lo = (c4 + R*c5)*R2 + (c2 + R*c3) */
108 vfmadd213pd {rn-sae}, %zmm14, %zmm15, %zmm1
109 vfmadd213pd {rn-sae}, %zmm12, %zmm15, %zmm8
/* Two-source table lookup across both 512-bit halves of Log_tbl. */
110 vpermt2pd Log_tbl+64+__svml_dlog10_data_internal_avx512(%rip), %zmm9, %zmm5
/* poly = (hi*R4 + lo)*R + c1 */
113 vfmadd213pd {rn-sae}, %zmm8, %zmm13, %zmm1
114 vfmadd213pd {rn-sae}, %zmm2, %zmm6, %zmm1
/* zmm6 = R*poly + Log_tbl[index] */
115 vfmadd213pd {rn-sae}, %zmm5, %zmm1, %zmm6
116 vmovups L2+__svml_dlog10_data_internal_avx512(%rip), %zmm1
/* result = k*L2 + (table + R*poly), L2 being the log10(2) row. */
117 vfmadd213pd {rn-sae}, %zmm6, %zmm1, %zmm0
120 /* Go to special inputs processing branch */
/* NOTE(review): the flag-setting instruction (presumably a test of the
   special-lane mask in %edx) is not visible in this view. */
121 jne L(SPECIAL_VALUES_BRANCH)
122 # LOE rbx r12 r13 r14 r15 edx zmm0 zmm7
125 * and exit the function
/* Slow path: spill the input (%zmm7) and the fast-path result (%zmm0)
   to the stack, then patch the flagged lanes via scalar calls. */
141 L(SPECIAL_VALUES_BRANCH):
142 vmovups %zmm7, 64(%rsp)
143 vmovups %zmm0, 128(%rsp)
144 # LOE rbx r12 r13 r14 r15 edx zmm0
147 # LOE rbx r12 r13 r14 r15 eax edx
151 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
152 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
155 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
156 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
159 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
160 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
161 # LOE rbx r15 r12d r13d
170 /* Call scalar math function */
171 jc L(SCALAR_MATH_CALL)
172 # LOE rbx r15 r12d r13d
/* Iterate over the bits of the special-lane mask. */
178 L(SPECIAL_VALUES_LOOP):
182 /* Check bits in range mask */
183 jl L(RANGEMASK_CHECK)
184 # LOE rbx r15 r12d r13d
/* All flagged lanes processed: reload the merged result vector. */
192 vmovups 128(%rsp), %zmm0
196 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
197 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
198 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
199 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
200 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
201 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
202 # LOE rbx r12 r13 r14 r15 zmm0
204 /* Scalar math function call
205 * to process special input
/* Load the r14-th spilled input lane into xmm0. */
210 vmovsd 64(%rsp, %r14, 8), %xmm0
212 # LOE rbx r14 r15 r12d r13d xmm0
/* Store the scalar result back into the r14-th result lane.
   NOTE(review): the call to the scalar log10 between the load above and
   this store is not visible in this view. */
214 vmovsd %xmm0, 128(%rsp, %r14, 8)
216 /* Process special inputs in loop */
217 jmp L(SPECIAL_VALUES_LOOP)
218 # LOE rbx r15 r12d r13d
219 END(_ZGVeN8v_log10_skx)
221 .section .rodata, "a"
/* Constant table for the routine above.  The C typedef below documents
   its layout: 64-byte-aligned rows, each broadcast constant stored once
   per lane (8 doubles), matching the #define offsets at the top of the
   file.  NOTE(review): the `typedef struct {` opener and the closing
   #endif of this #ifdef block are not visible in this view. */
224 #ifdef __svml_dlog10_data_internal_avx512_typedef
225 typedef unsigned int VUINT32;
227 __declspec(align(64)) VUINT32 Log_tbl[16][2];
228 __declspec(align(64)) VUINT32 One[8][2];
229 __declspec(align(64)) VUINT32 C075[8][2];
230 __declspec(align(64)) VUINT32 poly_coeff9[8][2];
231 __declspec(align(64)) VUINT32 poly_coeff8[8][2];
232 __declspec(align(64)) VUINT32 poly_coeff7[8][2];
233 __declspec(align(64)) VUINT32 poly_coeff6[8][2];
234 __declspec(align(64)) VUINT32 poly_coeff5[8][2];
235 __declspec(align(64)) VUINT32 poly_coeff4[8][2];
236 __declspec(align(64)) VUINT32 poly_coeff3[8][2];
237 __declspec(align(64)) VUINT32 poly_coeff2[8][2];
238 __declspec(align(64)) VUINT32 poly_coeff1[8][2];
239 __declspec(align(64)) VUINT32 L2[8][2];
240 } __svml_dlog10_data_internal_avx512;
/* NOTE(review): an .align 64 directive presumably precedes this label in
   the full file -- not visible here. */
242 __svml_dlog10_data_internal_avx512:
/* Log_tbl: 16 doubles of log10(1/DblRcp) = -log10(DblRcp), indexed by
   the rounded reciprocal's fraction bits (e.g. entry 0 = -log10(1.0) = 0;
   entry 1 ~= -log10(1.0625); entry 8 ~= -log10(0.75) = +0.12494). */
244 .quad 0x0000000000000000
245 .quad 0xbf9af5f92b00e610
246 .quad 0xbfaa30a9d609efea
247 .quad 0xbfb31b3055c47118
248 .quad 0xbfb8cf183886480d
249 .quad 0xbfbe3bc1ab0e19fe
250 .quad 0xbfc1b3e71ec94f7b
251 .quad 0xbfc42c7e7fe3fc02
252 .quad 0x3fbffbfc2bbc7803
253 .quad 0x3fbb721cd17157e3
254 .quad 0x3fb715d0ce367afc
255 .quad 0x3fb2e3a740b7800f
256 .quad 0x3fadb11ed766abf4
257 .quad 0x3fa5e3966b7e9295
258 .quad 0x3f9cb38fccd8bfdb
259 .quad 0x3f8c3d0837784c41
/* One: 1.0 broadcast (used for R = DblRcp*Mant - 1 and the +1 exponent
   correction). */
262 .quad 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000
/* C075: 0.75 broadcast, the DblRcp threshold for the exponent fixup. */
265 .quad 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000, 0x3fe8000000000000
/* poly_coeff9 .. poly_coeff2: minimax polynomial coefficients for
   log10(1+R), highest degree first, alternating in sign. */
268 .quad 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370, 0x3fa8c2d828480370
271 .quad 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814, 0xbfabd80d96029814
274 .quad 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2, 0x3fafc3f6f38b58a2
277 .quad 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80, 0xbfb287a63464dc80
280 .quad 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9, 0x3fb63c62777f27d9
283 .quad 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3, 0xbfbbcb7b153c06a3
286 .quad 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c, 0x3fc287a7636f428c
289 .quad 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db, 0xbfcbcb7b1526e4db
/* poly_coeff1: ~0.4342944819 = log10(e), the linear coefficient. */
292 .quad 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e, 0x3fdbcb7b1526e50e
/* L2: ~0.3010299957 = log10(2), multiplies the exponent k. */
295 .quad 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff, 0x3fd34413509f79ff
297 .type __svml_dlog10_data_internal_avx512, @object
298 .size __svml_dlog10_data_internal_avx512, .-__svml_dlog10_data_internal_avx512