1 /* Function sincos vectorized with SSE4.
2 Copyright (C) 2014-2022 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
20 #include "svml_d_trig_data.h"
/* _ZGVbN2vl8l8_sincos_sse4 — SSE4 vector sincos, 2 doubles per call.
   In:  %xmm0 = packed double args; %rdi, %rsi = output pointers
        (vl8l8 variant: sin results stored via %rdi, cos via %rsi).
   Uses %rax as base of the __svml_d_trig_data constant table (GOT load).
   NOTE(review): this excerpt is incomplete — the residual line numbers
   at the start of each line skip ranges, so many instructions (the
   initial push of %rbp implied by the CFI directives, the mulpd/psllq
   arithmetic referenced by the block comments, labels and the special-case
   branch) are not visible here.  Comments below annotate only what is
   shown; do not assume the visible instruction sequence is contiguous. */
23 ENTRY (_ZGVbN2vl8l8_sincos_sse4)
25 ALGORITHM DESCRIPTION:
27 ( low accuracy ( < 4ulp ) or enhanced performance
28 ( half of correct mantissa ) implementation )
30 Argument representation:
34 sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
35 arg + Pi/2 = (N'*Pi + R')
36 cos(arg) = sin(arg+Pi/2) = sin(N'*Pi + R') = (-1)^N' * sin(R')
37 sin(R), sin(R') are approximated by corresponding polynomial. */
/* NOTE(review): cfi_adjust_cfa_offset(8)/cfi_rel_offset(%rbp,0) imply a
   preceding `pushq %rbp` and a following `movq %rsp, %rbp` that are not
   visible in this excerpt — TODO confirm against upstream. */
40 cfi_adjust_cfa_offset (8)
41 cfi_rel_offset (%rbp, 0)
43 cfi_def_cfa_register (%rbp)
/* Load base address of the vectorized-trig constant table (PIC, via GOT). */
46 movq __svml_d_trig_data@GOTPCREL(%rip), %rax
/* Spill caller xmm registers that this routine reuses as scratch. */
47 movups %xmm11, 160(%rsp)
48 movups %xmm12, 144(%rsp)
49 movups __dSignMask(%rax), %xmm11
51 /* ARGUMENT RANGE REDUCTION:
52 Absolute argument: X' = |X| */
55 /* Grab sign bit from argument */
57 movups __dInvPI(%rax), %xmm5
60 /* SinY = X'*InvPi + RS : right shifter add */
/* NOTE(review): the mulpd by InvPi described above is not visible here. */
62 addpd __dRShifter(%rax), %xmm5
64 /* SinSignRes = Y<<63 : shift LSB to MSB place for result sign */
68 /* SinN = Y - RS : right shifter sub */
69 subpd __dRShifter(%rax), %xmm5
70 movups %xmm10, 176(%rsp)
72 movups __dPI1(%rax), %xmm10
74 /* SinR = X' - SinN*Pi1 */
77 movups __dPI2(%rax), %xmm6
79 /* SinR = SinR - SinN*Pi1 */
82 movups %xmm13, 112(%rsp)
87 /* Sine result sign: SinRSign = SignMask & SinR */
90 /* CosR = SinX - CosN*Pi1 */
92 movups __dOneHalf(%rax), %xmm3
95 /* Set SinRSign to 0.5 */
98 /* Update CosRSign and CosSignRes signs */
101 /* CosN = SinN +(-)0.5 */
/* Detect out-of-range lanes (|X| > RangeVal) for the special-case path. */
103 cmpnlepd __dRangeVal(%rax), %xmm4
106 /* CosR = CosR - CosN*Pi2 */
110 movups __dPI3(%rax), %xmm10
114 /* SinR = SinR - SinN*Pi3 */
117 /* Final reconstruction.
118 Combine Sin result's sign */
122 /* CosR = CosR - CosN*Pi3 */
126 movups __dPI4(%rax), %xmm6
128 /* SinR = SinR - SinN*Pi4 */
131 /* CosR = CosR - CosN*Pi4 */
144 /* Polynomial approximation */
/* Two parallel Horner evaluations: xmm3 accumulates the cosine-branch
   polynomial, xmm5 the sine-branch one, sharing the same C1..C7 table.
   NOTE(review): the mulpd steps between the addpd pairs are missing from
   this excerpt. */
145 movups __dC7(%rax), %xmm5
149 addpd __dC6(%rax), %xmm3
150 addpd __dC6(%rax), %xmm5
153 addpd __dC5(%rax), %xmm3
154 addpd __dC5(%rax), %xmm5
157 addpd __dC4(%rax), %xmm3
158 addpd __dC4(%rax), %xmm5
160 /* SinPoly = C3 + SinR2*(C4 + SinR2*(C5 + SinR2*(C6 + SinR2*C7))) */
163 /* CosPoly = C3 + CosR2*(C4 + CosR2*(C5 + CosR2*(C6 + CosR2*C7))) */
165 addpd __dC3(%rax), %xmm3
166 addpd __dC3(%rax), %xmm5
168 /* SinPoly = C2 + SinR2*SinPoly */
171 /* CosPoly = C2 + CosR2*CosPoly */
173 addpd __dC2(%rax), %xmm3
174 addpd __dC2(%rax), %xmm5
176 /* SinPoly = C1 + SinR2*SinPoly */
179 /* CosPoly = C1 + CosR2*CosPoly */
181 addpd __dC1(%rax), %xmm3
182 addpd __dC1(%rax), %xmm5
184 /* SinPoly = SinR2*SinPoly */
187 /* CosPoly = CosR2*CosPoly */
190 /* SinPoly = SinR*SinPoly */
193 /* CosPoly = CosR*CosPoly */
198 /* Update Sin result's sign */
201 /* Update Cos result's sign */
/* Restore spilled xmm registers and store the sine vector result.
   NOTE(review): the corresponding cos store (presumably via %rsi) is not
   visible in this excerpt — confirm against upstream. */
208 movups 176(%rsp), %xmm10
209 movaps %xmm13, (%rdi)
210 movups 160(%rsp), %xmm11
211 movups 144(%rsp), %xmm12
212 movups 112(%rsp), %xmm13
215 cfi_def_cfa_register (%rsp)
217 cfi_adjust_cfa_offset (-8)
/* ---- Special-case (out-of-range) path: spill full register context,
   call the scalar fallback per lane, then restore. ---- */
223 movups %xmm0, 128(%rsp)
224 movups %xmm13, 192(%rsp)
225 movups %xmm1, 256(%rsp)
230 movups %xmm8, 48(%rsp)
231 movups %xmm9, 32(%rsp)
232 movups %xmm14, 16(%rsp)
233 movups %xmm15, (%rsp)
/* CFI for callee-saved GPRs r12-r15 saved relative to %rsp.
   NOTE(review): the movq saves these directives describe are not visible
   in this excerpt. */
236 cfi_offset_rel_rsp (12, 104)
239 cfi_offset_rel_rsp (13, 96)
242 cfi_offset_rel_rsp (14, 88)
245 cfi_offset_rel_rsp (15, 80)
265 movups 48(%rsp), %xmm8
267 movups 32(%rsp), %xmm9
268 movups 16(%rsp), %xmm14
269 movups (%rsp), %xmm15
280 movups 192(%rsp), %xmm13
281 movups 256(%rsp), %xmm1
/* Per-lane scalar fixup: %r15 indexes the faulting lane; copy the scalar
   sin/cos results into the saved vector slots (high lane at +136/+200/+264,
   low lane at +128/+192/+256).
   NOTE(review): the scalar sin/cos calls between these moves are not
   visible in this excerpt. */
288 movsd 136(%rsp,%r15), %xmm0
292 movsd %xmm0, 200(%rsp,%r15)
293 movsd 136(%rsp,%r15), %xmm0
297 movsd %xmm0, 264(%rsp,%r15)
303 movsd 128(%rsp,%r15), %xmm0
307 movsd %xmm0, 192(%rsp,%r15)
308 movsd 128(%rsp,%r15), %xmm0
312 movsd %xmm0, 256(%rsp,%r15)
314 END (_ZGVbN2vl8l8_sincos_sse4)
315 libmvec_hidden_def(_ZGVbN2vl8l8_sincos_sse4)
317 /* vvv version implemented with wrapper to vl8l8 variant. */
/* _ZGVbN2vvv_sincos_sse4 — ABI wrapper: takes vectors of output POINTERS
   (%xmm1 = sin pointers, %xmm2 = cos pointers), stages them on the stack,
   calls the vl8l8 core, then scatters the results through each pointer.
   NOTE(review): this excerpt is incomplete — the residual line numbers
   skip ranges; the prologue/epilogue stack adjustments, the LP64 scatter
   stores, and (apparently) the `#ifdef __ILP32__` guards separating the
   LP64 path from the x32 path below are not visible here. */
318 ENTRY (_ZGVbN2vvv_sincos_sse4)
321 .cfi_def_cfa_offset 80
/* LP64 path: save the pointer vectors across the core call. */
322 movdqu %xmm1, 32(%rsp)
324 movdqu %xmm2, 48(%rdi)
326 call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
329 /* (results scattered through the saved pointers — not visible here) */
340 .cfi_def_cfa_offset 8
/* x32 (ILP32) path: pointers are 32-bit, hence the %esp addressing and
   pextrd extraction of the second lane's pointer.
   NOTE(review): the #else/#ifdef lines and the movd/mov stores through
   %eax between these instructions are missing from this excerpt. */
344 .cfi_def_cfa_offset 80
346 movaps %xmm1, 16(%esp)
349 call HIDDEN_JUMPTARGET(_ZGVbN2vl8l8_sincos_sse4)
350 movdqa 16(%esp), %xmm1
/* Scatter sin results: lane 0 then lane 1 (pointer from pextrd). */
351 movsd 32(%esp), %xmm0
355 movsd 40(%esp), %xmm0
356 pextrd $1, %xmm1, %eax
/* Scatter cos results likewise via %xmm2's pointers. */
358 movsd 48(%esp), %xmm0
361 movsd 56(%esp), %xmm0
362 pextrd $1, %xmm2, %eax
365 .cfi_def_cfa_offset 8
368 END (_ZGVbN2vvv_sincos_sse4)