1 /* Function coshf vectorized with AVX2.
2 Copyright (C) 2021-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 https://www.gnu.org/licenses/. */
20 * ALGORITHM DESCRIPTION:
22 * Compute cosh(x) as (exp(x)+exp(-x))/2,
23 * where exp is calculated as
24 * exp(M*ln2 + ln2*(j/2^k) + r) = 2^M * 2^(j/2^k) * exp(r)
28 * cosh(NaN) = quiet NaN, and raise invalid exception
29 * cosh(INF) = that INF
31 * cosh(x) overflows for big |x|, i.e. beyond about MAXLOG+log(2)
35 /* Offsets for data table __svml_scosh_data_internal
42 #define _iDomainRange 160
53 .section .text.avx2, "ax", @progbits
/* _ZGVdN8v_coshf_avx2: AVX2 coshf kernel working on packed single-precision lanes held in ymm registers. */
/* In the code below %ymm1 is read as the input x, %ymm0 receives the result, and %edx gets a per-lane mask of inputs with |x| >= _iDomainRange. */
54 ENTRY(_ZGVdN8v_coshf_avx2)
56 cfi_def_cfa_offset(16)
/* ymm2 = sign-bit mask, ymm7 = shifter constant */
62 vmovups _sSign+__svml_scosh_data_internal(%rip), %ymm2
63 vmovups _sShifter+__svml_scosh_data_internal(%rip), %ymm7
67 * dM = x/log(2) + RShifter
69 vmovups _sInvLn2+__svml_scosh_data_internal(%rip), %ymm10
70 vmovups _sLn2hi+__svml_scosh_data_internal(%rip), %ymm8
71 vmovups _iDomainRange+__svml_scosh_data_internal(%rip), %ymm3
74 * sinh(r) = r*((a1=1)+r^2*(a3+r^2*(a5+{v1 r^2*a7})))) = r + r*(r^2*(a3+r^2*(a5+r^2*a7))) ....
75 * sSinh_r = (a3+r^2*a5)
77 vmovups _sPC5+__svml_scosh_data_internal(%rip), %ymm15
78 vmovups _iHalf+__svml_scosh_data_internal(%rip), %ymm11
/* ymm0 = |x|: the input with the sign-mask bits cleared */
85 vandnps %ymm1, %ymm2, %ymm0
/* ymm10 = |x| * InvLn2 + Shifter */
86 vfmadd213ps %ymm7, %ymm0, %ymm10
/* ymm9 = sN = ymm10 - Shifter */
92 vsubps %ymm7, %ymm10, %ymm9
96 * iM now is an EXP(2^N)
/* ymm12 = bits of ymm10 shifted left by 23, i.e. moved up to the float exponent field */
98 vpslld $23, %ymm10, %ymm12
100 /* Check for overflow/underflow */
/* ymm4 = (|x| > DomainRange), ymm5 = (|x| == DomainRange); both compare the float bit patterns as 32-bit integers */
101 vpcmpgtd %ymm3, %ymm0, %ymm4
102 vpcmpeqd %ymm3, %ymm0, %ymm5
104 /* sR = sX - sN*Log2_hi */
105 vfnmadd231ps %ymm8, %ymm9, %ymm0
/* ymm13 = iHalf + ymm12, ymm14 = iHalf - ymm12 (integer add/sub on the bit patterns) */
106 vpaddd %ymm12, %ymm11, %ymm13
107 vpsubd %ymm12, %ymm11, %ymm14
/* ymm6 = out-of-range mask: |x| >= DomainRange */
108 vpor %ymm5, %ymm4, %ymm6
110 /* sR = (sX - sN*Log2_hi) - sN*Log2_lo */
111 vfnmadd231ps _sLn2lo+__svml_scosh_data_internal(%rip), %ymm9, %ymm0
113 /* sG1 = 2^(N-1)-2^(-N-1) */
114 vsubps %ymm14, %ymm13, %ymm4
116 /* sG2 = 2^(N-1)+2^(-N-1) */
117 vaddps %ymm14, %ymm13, %ymm3
119 /* sR2 = sR^2, shuffled */
120 vmulps %ymm0, %ymm0, %ymm2
/* ymm15 = a3 + sR2*a5 (ymm15 held a5 on entry to this instruction) */
121 vfmadd213ps _sPC3+__svml_scosh_data_internal(%rip), %ymm2, %ymm15
123 /* sSinh_r = r^2*(a3+r^2*a5) */
124 vmulps %ymm15, %ymm2, %ymm13
126 /* sSinh_r = r + r*(r^2*(a3+r^2*a5)) */
127 vfmadd213ps %ymm0, %ymm0, %ymm13
/* NOTE: the expression labelled sinh(X) just below is the cosh(X) reconstruction for this kernel: sG2 times cosh(r) plus sG1 times sinh(r), with cosh(r) = 1 + sR2*(a2+sR2*(a4+a6*sR2)) */
130 * sinh(X) = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2)
131 * sOut = (a4 +a6*sR2)
133 vmovups _sPC6+__svml_scosh_data_internal(%rip), %ymm0
134 vfmadd213ps _sPC4+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
136 /* sOut = a2+sR2*(a4+a6*sR2) */
137 vfmadd213ps _sPC2+__svml_scosh_data_internal(%rip), %ymm2, %ymm0
139 /* sOut = sR2*(a2+sR2*(a4+a6*sR2) */
140 vmulps %ymm0, %ymm2, %ymm15
142 /* sOut = sG2*sR2*(a2+sR2*(a4+a6*sR2) */
143 vmulps %ymm15, %ymm3, %ymm14
145 /* sOut = sG1*sinh(dR)+sG2*sR2*(a2+sR2*(a4+a6*sR2) */
146 vfmadd213ps %ymm14, %ymm13, %ymm4
/* edx = one bit per lane, taken from the sign bits of the out-of-range mask in ymm6 */
147 vmovmskps %ymm6, %edx
149 /* sOut = sG2 + sG1*sinh(dR) + sG2*sR2*(a2+sR2*(a4+a6*sR2) */
150 vaddps %ymm4, %ymm3, %ymm0
155 /* Go to special inputs processing branch */
156 jne L(SPECIAL_VALUES_BRANCH)
157 # LOE rbx r12 r13 r14 r15 edx ymm0 ymm1
160 * and exit the function
176 L(SPECIAL_VALUES_BRANCH):
/* Save the input vector (ymm1) and the fast-path result (ymm0) on the stack so individual lanes can be patched */
177 vmovups %ymm1, 32(%rsp)
178 vmovups %ymm0, 64(%rsp)
179 # LOE rbx r12 r13 r14 r15 edx ymm0
182 # LOE rbx r12 r13 r14 r15 eax edx
186 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
187 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
190 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
191 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
194 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
195 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
196 # LOE rbx r15 r12d r13d
205 /* Call scalar math function */
206 jc L(SCALAR_MATH_CALL)
207 # LOE rbx r15 r12d r13d
213 L(SPECIAL_VALUES_LOOP):
217 /* Check bits in range mask */
218 jl L(RANGEMASK_CHECK)
219 # LOE rbx r15 r12d r13d
/* Reload the result vector from its stack slot into ymm0 */
227 vmovups 64(%rsp), %ymm0
231 /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
232 .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
233 /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
234 .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
235 /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
236 .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
237 # LOE rbx r12 r13 r14 r15 ymm0
239 /* Scalar math function call
240 * to process special input
245 vmovss 32(%rsp, %r14, 4), %xmm0
247 # LOE rbx r14 r15 r12d r13d xmm0
249 vmovss %xmm0, 64(%rsp, %r14, 4)
251 /* Process special inputs in loop */
252 jmp L(SPECIAL_VALUES_LOOP)
253 # LOE rbx r15 r12d r13d
254 END(_ZGVdN8v_coshf_avx2)
256 .section .rodata, "a"
/* C-style description of the table layout; only seen when __svml_scosh_data_internal_typedef is defined. */
/* Every field is declared as 8 32-bit words with 32-byte alignment. */
259 #ifdef __svml_scosh_data_internal_typedef
260 typedef unsigned int VUINT32;
262 __declspec(align(32)) VUINT32 _sInvLn2[8][1];
263 __declspec(align(32)) VUINT32 _sLn2hi[8][1];
264 __declspec(align(32)) VUINT32 _sLn2lo[8][1];
265 __declspec(align(32)) VUINT32 _sSign[8][1];
266 __declspec(align(32)) VUINT32 _sShifter[8][1];
267 __declspec(align(32)) VUINT32 _iDomainRange[8][1];
268 __declspec(align(32)) VUINT32 _sPC1[8][1];
269 __declspec(align(32)) VUINT32 _sPC2[8][1];
270 __declspec(align(32)) VUINT32 _sPC3[8][1];
271 __declspec(align(32)) VUINT32 _sPC4[8][1];
272 __declspec(align(32)) VUINT32 _sPC5[8][1];
273 __declspec(align(32)) VUINT32 _sPC6[8][1];
274 __declspec(align(32)) VUINT32 _iHalf[8][1];
275 } __svml_scosh_data_internal;
/* Constant table for the coshf kernel. Each entry is one 32-bit pattern repeated 8 times (32 bytes), so one 256-bit load puts the same value in every lane. */
/* Entries follow the field order of the layout description above, 32 bytes apart (for example _iDomainRange is the 6th entry, at byte offset 160). */
277 __svml_scosh_data_internal:
/* 1/ln(2) as a float */
278 .long 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B, 0x3FB8AA3B /* _sInvLn2 */ // k=0
/* ln(2) split in two floats: a high part, then a small low correction term */
280 .long 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000, 0x3F317000 /* _sLn2hi */
282 .long 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4, 0x3805fdf4 /* _sLn2lo */
/* Sign-bit mask; the kernel uses it with vandnps to form |x| */
284 .long 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000, 0x80000000 /* _sSign */
/* 1.5 * 2^23 as a float; added to and then subtracted from x/ln(2) by the kernel */
286 .long 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000, 0x4b400000 /* _sShifter */
/* About 87.34 as a float; lanes whose |x| bit pattern is at or above this value are flagged for the special-values path */
288 .long 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E, 0x42AEAC4E /* _iDomainRange */
/* Polynomial coefficients a1..a6: a1 = 1.0, a2 = 0.5, and a3..a6 are approximately 1/k! */
/* The kernel uses a3 and a5 for the sinh(r) part and a2, a4 and a6 for the cosh(r) part */
290 .long 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000 /* _sPC1=1 */
292 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _sPC2 */
294 .long 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57, 0x3e2aaa57 /* _sPC3 */
296 .long 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72, 0x3d2aaa72 /* _sPC4 */
298 .long 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461, 0x3c091461 /* _sPC5 */
300 .long 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3, 0x3ab6a8a3 /* _sPC6 */
/* Bit pattern of 0.5f; the kernel adds and subtracts the shifted exponent bits to it as integers */
303 .long 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000, 0x3f000000 /* _iHalf */
305 .type __svml_scosh_data_internal, @object
306 .size __svml_scosh_data_internal, .-__svml_scosh_data_internal