initial commit
[glibc.git] / sysdeps / ia64 / fpu / libm_tan.S
1 .file "libm_tan.s"
2
3 // Copyright (C) 2000, 2001, Intel Corporation
4 // All rights reserved.
5 //
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are
9 // met:
10 //
11 // * Redistributions of source code must retain the above copyright
12 // notice, this list of conditions and the following disclaimer.
13 //
14 // * Redistributions in binary form must reproduce the above copyright
15 // notice, this list of conditions and the following disclaimer in the
16 // documentation and/or other materials provided with the distribution.
17 //
18 // * The name of Intel Corporation may not be used to endorse or promote
19 // products derived from this software without specific prior written
20 // permission.
21 //
22 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
25 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
26 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
27 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
28 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
29 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
30 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
31 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
32 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 //
34 // Intel Corporation is the author of this code, and requests that all
35 // problem reports or change requests be submitted to it directly at
36 // http://developer.intel.com/opensource.
37 //
38 // *********************************************************************
39 //
40 // History:
41 // 02/02/00 Initial Version
42 // 4/04/00 Unwind support added
43 // 12/28/00 Fixed false invalid flags
44 //
45 // *********************************************************************
46 //
47 // Function: tan(x) = tangent(x), for double precision x values
48 //
49 // *********************************************************************
50 //
51 // Accuracy: Very accurate for double-precision values
52 //
53 // *********************************************************************
54 //
55 // Resources Used:
56 //
57 // Floating-Point Registers: f8 (Input and Return Value)
58 // f9-f15
59 // f32-f112
60 //
61 // General Purpose Registers:
62 // r32-r48
63 // r49-r50 (Used to pass arguments to pi_by_2 reduce routine)
64 //
65 // Predicate Registers: p6-p15
66 //
67 // *********************************************************************
68 //
69 // IEEE Special Conditions:
70 //
71 // Denormal fault raised on denormal inputs
72 // Overflow exceptions do not occur
73 // Underflow exceptions raised when appropriate for tan
74 // (No specialized error handling for this routine)
75 // Inexact raised when appropriate by algorithm
76 //
77 // tan(SNaN) = QNaN
78 // tan(QNaN) = QNaN
79 // tan(inf) = QNaN
80 // tan(+/-0) = +/-0
81 //
82 // *********************************************************************
83 //
84 // Mathematical Description
85 //
86 // We consider the computation of FPTAN of Arg. Now, given
87 //
88 // Arg = N pi/2 + alpha, |alpha| <= pi/4,
89 //
90 // basic mathematical relationship shows that
91 //
92 // tan( Arg ) = tan( alpha ) if N is even;
93 // = -cot( alpha ) otherwise.
94 //
95 // The value of alpha is obtained by argument reduction and
96 // represented by two working precision numbers r and c where
97 //
98 // alpha = r + c accurately.
99 //
100 // The reduction method is described in a previous write up.
101 // The argument reduction scheme identifies 4 cases. For Cases 2
102 // and 4, because |alpha| is small, tan(r+c) and -cot(r+c) can be
103 // computed very easily by 2 or 3 terms of the Taylor series
104 // expansion as follows:
105 //
106 // Case 2:
107 // -------
108 //
109 // tan(r + c) = r + c + r^3/3 ...accurately
110 // -cot(r + c) = -1/(r+c) + r/3 ...accurately
111 //
112 // Case 4:
113 // -------
114 //
115 // tan(r + c) = r + c + r^3/3 + 2r^5/15 ...accurately
116 // -cot(r + c) = -1/(r+c) + r/3 + r^3/45 ...accurately
117 //
118 //
119 // The only cases left are Cases 1 and 3 of the argument reduction
120 // procedure. These two cases will be merged since after the
121 // argument is reduced in either cases, we have the reduced argument
122 // represented as r + c and that the magnitude |r + c| is not small
123 // enough to allow the usage of a very short approximation.
124 //
125 // The greatest challenge of this task is that the second terms of
126 // the Taylor series for tan(r) and -cot(r)
127 //
128 // r + r^3/3 + 2 r^5/15 + ...
129 //
130 // and
131 //
132 // -1/r + r/3 + r^3/45 + ...
133 //
134 // are not very small when |r| is close to pi/4 and the rounding
135 // errors will be a concern if simple polynomial accumulation is
136 // used. When |r| < 2^(-2), however, the second terms will be small
137 // enough (5 bits or so of right shift) that a normal Horner
138 // recurrence suffices. Hence there are two cases that we consider
139 // in the accurate computation of tan(r) and cot(r), |r| <= pi/4.
140 //
141 // Case small_r: |r| < 2^(-2)
142 // --------------------------
143 //
144 // Since Arg = N pi/4 + r + c accurately, we have
145 //
146 // tan(Arg) = tan(r+c) for N even,
147 // = -cot(r+c) otherwise.
148 //
149 // Here for this case, both tan(r) and -cot(r) can be approximated
150 // by simple polynomials:
151 //
152 // tan(r) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
153 // -cot(r) = -1/r + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
154 //
155 // accurately. Since |r| is relatively small, tan(r+c) and
156 // -cot(r+c) can be accurately approximated by replacing r with
157 // r+c only in the first two terms of the corresponding polynomials.
158 //
159 // Note that P1_1 (and Q1_1 for that matter) approximates 1/3 to
160 // almost 64 sig. bits, thus
161 //
162 // P1_1 (r+c)^3 = P1_1 r^3 + c * r^2 accurately.
163 //
164 // Hence,
165 //
166 // tan(r+c) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
167 // + c*(1 + r^2)
168 //
169 // -cot(r+c) = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
170 // + Q1_1*c
171 //
172 //
173 // Case normal_r: 2^(-2) <= |r| <= pi/4
174 // ------------------------------------
175 //
176 // This case is more likely than the previous one if one considers
177 // r to be uniformly distributed in [-pi/4 pi/4].
178 //
179 // The required calculation is either
180 //
181 // tan(r + c) = tan(r) + correction, or
182 // -cot(r + c) = -cot(r) + correction.
183 //
184 // Specifically,
185 //
186 // tan(r + c) = tan(r) + c tan'(r) + O(c^2)
187 // = tan(r) + c sec^2(r) + O(c^2)
188 // = tan(r) + c SEC_sq ...accurately
189 // as long as SEC_sq approximates sec^2(r)
190 // to, say, 5 bits or so.
191 //
192 // Similarly,
193 //
194 // -cot(r + c) = -cot(r) - c cot'(r) + O(c^2)
195 // = -cot(r) + c csc^2(r) + O(c^2)
196 // = -cot(r) + c CSC_sq ...accurately
197 // as long as CSC_sq approximates csc^2(r)
198 // to, say, 5 bits or so.
199 //
200 // We therefore concentrate on accurately calculating tan(r) and
201 // cot(r) for a working-precision number r, |r| <= pi/4 to within
202 // 0.1% or so.
203 //
204 // We will employ a table-driven approach. Let
205 //
206 // r = sgn_r * 2^k * 1.b_1 b_2 ... b_5 ... b_63
207 // = sgn_r * ( B + x )
208 //
209 // where
210 //
211 // B = 2^k * 1.b_1 b_2 ... b_5 1
212 // x = |r| - B
213 //
214 // Now,
215 // tan(B) + tan(x)
216 // tan( B + x ) = ------------------------
217 // 1 - tan(B)*tan(x)
218 //
219 // / \
220 // | tan(B) + tan(x) |
221
222 // = tan(B) + | ------------------------ - tan(B) |
223 // | 1 - tan(B)*tan(x) |
224 // \ /
225 //
226 // sec^2(B) * tan(x)
227 // = tan(B) + ------------------------
228 // 1 - tan(B)*tan(x)
229 //
230 // (1/[sin(B)*cos(B)]) * tan(x)
231 // = tan(B) + --------------------------------
232 // cot(B) - tan(x)
233 //
234 //
235 // Clearly, the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
236 // calculated beforehand and stored in a table. Since
237 //
238 // |x| <= 2^k * 2^(-6) <= 2^(-7) (because k = -1, -2)
239 //
240 // a very short polynomial will be sufficient to approximate tan(x)
241 // accurately. The details involved in computing the last expression
242 // will be given in the next section on algorithm description.
243 //
244 //
245 // Now, we turn to the case where cot( B + x ) is needed.
246 //
247 //
248 // 1 - tan(B)*tan(x)
249 // cot( B + x ) = ------------------------
250 // tan(B) + tan(x)
251 //
252 // / \
253 // | 1 - tan(B)*tan(x) |
254
255 // = cot(B) + | ----------------------- - cot(B) |
256 // | tan(B) + tan(x) |
257 // \ /
258 //
259 // [tan(B) + cot(B)] * tan(x)
260 // = cot(B) - ----------------------------
261 // tan(B) + tan(x)
262 //
263 // (1/[sin(B)*cos(B)]) * tan(x)
264 // = cot(B) - --------------------------------
265 // tan(B) + tan(x)
266 //
267 //
268 // Note that the values of tan(B), cot(B) and 1/(sin(B)*cos(B)) that
269 // are needed are the same set of values needed in the previous
270 // case.
271 //
272 // Finally, we can put all the ingredients together as follows:
273 //
274 // Arg = N * pi/2 + r + c ...accurately
275 //
276 // tan(Arg) = tan(r) + correction if N is even;
277 // = -cot(r) + correction otherwise.
278 //
279 // For Cases 2 and 4,
280 //
281 // Case 2:
282 // tan(Arg) = tan(r + c) = r + c + r^3/3 N even
283 // = -cot(r + c) = -1/(r+c) + r/3 N odd
284 // Case 4:
285 // tan(Arg) = tan(r + c) = r + c + r^3/3 + 2r^5/15 N even
286 // = -cot(r + c) = -1/(r+c) + r/3 + r^3/45 N odd
287 //
288 //
289 // For Cases 1 and 3,
290 //
291 // Case small_r: |r| < 2^(-2)
292 //
293 // tan(Arg) = r + P1_1 r^3 + P1_2 r^5 + ... + P1_9 r^19
294 // + c*(1 + r^2) N even
295 //
296 // = -1/(r+c) + Q1_1 r + Q1_2 r^3 + ... + Q1_7 r^13
297 // + Q1_1*c N odd
298 //
299 // Case normal_r: 2^(-2) <= |r| <= pi/4
300 //
301 // tan(Arg) = tan(r) + c * sec^2(r) N even
302 // = -cot(r) + c * csc^2(r) otherwise
303 //
304 // For N even,
305 //
306 // tan(Arg) = tan(r) + c*sec^2(r)
307 // = tan( sgn_r * (B+x) ) + c * sec^2(|r|)
308 // = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(|r|) )
309 // = sgn_r * ( tan(B+x) + sgn_r*c*sec^2(B) )
310 //
311 // since B approximates |r| to 2^(-6) in relative accuracy.
312 //
313 // / (1/[sin(B)*cos(B)]) * tan(x)
314 // tan(Arg) = sgn_r * | tan(B) + --------------------------------
315 // \ cot(B) - tan(x)
316 // \
317 // + CORR |
318
319 // /
320 // where
321 //
322 // CORR = sgn_r*c*tan(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
323 //
324 // For N odd,
325 //
326 // tan(Arg) = -cot(r) + c*csc^2(r)
327 // = -cot( sgn_r * (B+x) ) + c * csc^2(|r|)
328 // = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(|r|) )
329 // = sgn_r * ( -cot(B+x) + sgn_r*c*csc^2(B) )
330 //
331 // since B approximates |r| to 2^(-6) in relative accuracy.
332 //
333 // / (1/[sin(B)*cos(B)]) * tan(x)
334 // tan(Arg) = sgn_r * | -cot(B) + --------------------------------
335 // \ tan(B) + tan(x)
336 // \
337 // + CORR |
338
339 // /
340 // where
341 //
342 // CORR = sgn_r*c*cot(B)*SC_inv(B); SC_inv(B) = 1/(sin(B)*cos(B)).
343 //
344 //
345 // The actual algorithm prescribes how all the mathematical formulas
346 // are calculated.
347 //
348 //
349 // 2. Algorithmic Description
350 // ==========================
351 //
352 // 2.1 Computation for Cases 2 and 4.
353 // ----------------------------------
354 //
355 // For Case 2, we use two-term polynomials.
356 //
357 // For N even,
358 //
359 // rsq := r * r
360 // Result := c + r * rsq * P1_1
361 // Result := r + Result ...in user-defined rounding
362 //
363 // For N odd,
364 // S_hi := -frcpa(r) ...8 bits
365 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
366 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
367 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
368 // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
369 // ...S_hi + S_lo is -1/(r+c) to extra precision
370 // S_lo := S_lo + Q1_1*r
371 //
372 // Result := S_hi + S_lo ...in user-defined rounding
373 //
374 // For Case 4, we use three-term polynomials
375 //
376 // For N even,
377 //
378 // rsq := r * r
379 // Result := c + r * rsq * (P1_1 + rsq * P1_2)
380 // Result := r + Result ...in user-defined rounding
381 //
382 // For N odd,
383 // S_hi := -frcpa(r) ...8 bits
384 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
385 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
386 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
387 // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
388 // ...S_hi + S_lo is -1/(r+c) to extra precision
389 // rsq := r * r
390 // P := Q1_1 + rsq*Q1_2
391 // S_lo := S_lo + r*P
392 //
393 // Result := S_hi + S_lo ...in user-defined rounding
394 //
395 //
396 // Note that the coefficients P1_1, P1_2, Q1_1, and Q1_2 are
397 // the same as those used in the small_r case of Cases 1 and 3
398 // below.
399 //
400 //
401 // 2.2 Computation for Cases 1 and 3.
402 // ----------------------------------
403 // This is further divided into the case of small_r,
404 // where |r| < 2^(-2), and the case of normal_r, where |r| lies between
405 // 2^(-2) and pi/4.
406 //
407 // Algorithm for the case of small_r
408 // ---------------------------------
409 //
410 // For N even,
411 // rsq := r * r
412 // Poly1 := rsq*(P1_1 + rsq*(P1_2 + rsq*P1_3))
413 // r_to_the_8 := rsq * rsq
414 // r_to_the_8 := r_to_the_8 * r_to_the_8
415 // Poly2 := P1_4 + rsq*(P1_5 + rsq*(P1_6 + ... rsq*P1_9))
416 // CORR := c * ( 1 + rsq )
417 // Poly := Poly1 + r_to_the_8*Poly2
418 // Result := r*Poly + CORR
419 // Result := r + Result ...in user-defined rounding
420 // ...note that Poly1 and r_to_the_8 can be computed in parallel
421 // ...with Poly2 (Poly1 is intentionally set to be much
422 // ...shorter than Poly2 so that r_to_the_8 and CORR can be hidden)
423 //
424 // For N odd,
425 // S_hi := -frcpa(r) ...8 bits
426 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...16 bits
427 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...32 bits
428 // S_hi := S_hi + S_hi*(1 + S_hi*r) ...64 bits
429 // S_lo := S_hi*( (1 + S_hi*r) + S_hi*c )
430 // ...S_hi + S_lo is -1/(r+c) to extra precision
431 // S_lo := S_lo + Q1_1*c
432 //
433 // ...S_hi and S_lo are computed in parallel with
434 // ...the following
435 // rsq := r*r
436 // P := Q1_1 + rsq*(Q1_2 + rsq*(Q1_3 + ... + rsq*Q1_7))
437 //
438 // Result := r*P + S_lo
439 // Result := S_hi + Result ...in user-defined rounding
440 //
441 //
442 // Algorithm for the case of normal_r
443 // ----------------------------------
444 //
445 // Here, we first consider the computation of tan( r + c ). As
446 // presented in the previous section,
447 //
448 // tan( r + c ) = tan(r) + c * sec^2(r)
449 // = sgn_r * [ tan(B+x) + CORR ]
450 // CORR = sgn_r * c * tan(B) * 1/[sin(B)*cos(B)]
451 //
452 // because sec^2(r) = sec^(|r|), and B approximate |r| to 6.5 bits.
453 //
454 // tan( r + c ) =
455 // / (1/[sin(B)*cos(B)]) * tan(x)
456 // sgn_r * | tan(B) + -------------------------------- +
457 // \ cot(B) - tan(x)
458 // \
459 // CORR |
460
461 // /
462 //
463 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
464 // calculated beforehand and stored in a table. Specifically,
465 // the table values are
466 //
467 // tan(B) as T_hi + T_lo;
468 // cot(B) as C_hi + C_lo;
469 // 1/[sin(B)*cos(B)] as SC_inv
470 //
471 // T_hi, C_hi are in double-precision memory format;
472 // T_lo, C_lo are in single-precision memory format;
473 // SC_inv is in extended-precision memory format.
474 //
475 // The value of tan(x) will be approximated by a short polynomial of
476 // the form
477 //
478 // tan(x) as x + x * P, where
479 // P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
480 //
481 // Because |x| <= 2^(-7), cot(B) - x approximates cot(B) - tan(x)
482 // to a relative accuracy better than 2^(-20). Thus, a good
483 // initial guess of 1/( cot(B) - tan(x) ) to initiate the iterative
484 // division is:
485 //
486 // 1/(cot(B) - tan(x)) is approximately
487 // 1/(cot(B) - x) is
488 // tan(B)/(1 - x*tan(B)) is approximately
489 // T_hi / ( 1 - T_hi * x ) is approximately
490 //
491 // T_hi * [ 1 + (Thi * x) + (T_hi * x)^2 ]
492 //
493 // The calculation of tan(r+c) therefore proceed as follows:
494 //
495 // Tx := T_hi * x
496 // xsq := x * x
497 //
498 // V_hi := T_hi*(1 + Tx*(1 + Tx))
499 // P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
500 // ...V_hi serves as an initial guess of 1/(cot(B) - tan(x))
501 // ...good to about 20 bits of accuracy
502 //
503 // tanx := x + x*P
504 // D := C_hi - tanx
505 // ...D is a double precision denominator: cot(B) - tan(x)
506 //
507 // V_hi := V_hi + V_hi*(1 - V_hi*D)
508 // ....V_hi approximates 1/(cot(B)-tan(x)) to 40 bits
509 //
510 // V_lo := V_hi * ( [ (1 - V_hi*C_hi) + V_hi*tanx ]
511 // - V_hi*C_lo ) ...observe all order
512 // ...V_hi + V_lo approximates 1/(cot(B) - tan(x))
513 // ...to extra accuracy
514 //
515 // ... SC_inv(B) * (x + x*P)
516 // ... tan(B) + ------------------------- + CORR
517 // ... cot(B) - (x + x*P)
518 // ...
519 // ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
520 // ...
521 //
522 // Sx := SC_inv * x
523 // CORR := sgn_r * c * SC_inv * T_hi
524 //
525 // ...put the ingredients together to compute
526 // ... SC_inv(B) * (x + x*P)
527 // ... tan(B) + ------------------------- + CORR
528 // ... cot(B) - (x + x*P)
529 // ...
530 // ... = tan(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
531 // ...
532 // ... = T_hi + T_lo + CORR +
533 // ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
534 //
535 // CORR := CORR + T_lo
536 // tail := V_lo + P*(V_hi + V_lo)
537 // tail := Sx * tail + CORR
538 // tail := Sx * V_hi + tail
539 // T_hi := sgn_r * T_hi
540 //
541 // ...T_hi + sgn_r*tail now approximate
542 // ...sgn_r*(tan(B+x) + CORR) accurately
543 //
544 // Result := T_hi + sgn_r*tail ...in user-defined
545 // ...rounding control
546 // ...It is crucial that independent paths be fully
547 // ...exploited for performance's sake.
548 //
549 //
550 // Next, we consider the computation of -cot( r + c ). As
551 // presented in the previous section,
552 //
553 // -cot( r + c ) = -cot(r) + c * csc^2(r)
554 // = sgn_r * [ -cot(B+x) + CORR ]
555 // CORR = sgn_r * c * cot(B) * 1/[sin(B)*cos(B)]
556 //
557 // because csc^2(r) = csc^(|r|), and B approximate |r| to 6.5 bits.
558 //
559 // -cot( r + c ) =
560 // / (1/[sin(B)*cos(B)]) * tan(x)
561 // sgn_r * | -cot(B) + -------------------------------- +
562 // \ tan(B) + tan(x)
563 // \
564 // CORR |
565
566 // /
567 //
568 // The values of tan(B), cot(B) and 1/(sin(B)*cos(B)) are
569 // calculated beforehand and stored in a table. Specifically,
570 // the table values are
571 //
572 // tan(B) as T_hi + T_lo;
573 // cot(B) as C_hi + C_lo;
574 // 1/[sin(B)*cos(B)] as SC_inv
575 //
576 // T_hi, C_hi are in double-precision memory format;
577 // T_lo, C_lo are in single-precision memory format;
578 // SC_inv is in extended-precision memory format.
579 //
580 // The value of tan(x) will be approximated by a short polynomial of
581 // the form
582 //
583 // tan(x) as x + x * P, where
584 // P = x^2 * (P2_1 + x^2 * (P2_2 + x^2 * P2_3))
585 //
586 // Because |x| <= 2^(-7), tan(B) + x approximates tan(B) + tan(x)
587 // to a relative accuracy better than 2^(-18). Thus, a good
588 // initial guess of 1/( tan(B) + tan(x) ) to initiate the iterative
589 // division is:
590 //
591 // 1/(tan(B) + tan(x)) is approximately
592 // 1/(tan(B) + x) is
593 // cot(B)/(1 + x*cot(B)) is approximately
594 // C_hi / ( 1 + C_hi * x ) is approximately
595 //
596 // C_hi * [ 1 - (C_hi * x) + (C_hi * x)^2 ]
597 //
598 // The calculation of -cot(r+c) therefore proceed as follows:
599 //
600 // Cx := C_hi * x
601 // xsq := x * x
602 //
603 // V_hi := C_hi*(1 - Cx*(1 - Cx))
604 // P := xsq * (P1_1 + xsq*(P1_2 + xsq*P1_3))
605 // ...V_hi serves as an initial guess of 1/(tan(B) + tan(x))
606 // ...good to about 18 bits of accuracy
607 //
608 // tanx := x + x*P
609 // D := T_hi + tanx
610 // ...D is a double precision denominator: tan(B) + tan(x)
611 //
612 // V_hi := V_hi + V_hi*(1 - V_hi*D)
613 // ....V_hi approximates 1/(tan(B)+tan(x)) to 40 bits
614 //
615 // V_lo := V_hi * ( [ (1 - V_hi*T_hi) - V_hi*tanx ]
616 // - V_hi*T_lo ) ...observe all order
617 // ...V_hi + V_lo approximates 1/(tan(B) + tan(x))
618 // ...to extra accuracy
619 //
620 // ... SC_inv(B) * (x + x*P)
621 // ... -cot(B) + ------------------------- + CORR
622 // ... tan(B) + (x + x*P)
623 // ...
624 // ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
625 // ...
626 //
627 // Sx := SC_inv * x
628 // CORR := sgn_r * c * SC_inv * C_hi
629 //
630 // ...put the ingredients together to compute
631 // ... SC_inv(B) * (x + x*P)
632 // ... -cot(B) + ------------------------- + CORR
633 // ... tan(B) + (x + x*P)
634 // ...
635 // ... =-cot(B) + SC_inv(B)*(x + x*P)*(V_hi + V_lo) + CORR
636 // ...
637 // ... =-C_hi - C_lo + CORR +
638 // ... Sx * V_hi + Sx * V_lo + Sx * P *(V_hi + V_lo)
639 //
640 // CORR := CORR - C_lo
641 // tail := V_lo + P*(V_hi + V_lo)
642 // tail := Sx * tail + CORR
643 // tail := Sx * V_hi + tail
644 // C_hi := -sgn_r * C_hi
645 //
646 // ...C_hi + sgn_r*tail now approximates
647 // ...sgn_r*(-cot(B+x) + CORR) accurately
648 //
649 // Result := C_hi + sgn_r*tail in user-defined rounding control
650 // ...It is crucial that independent paths be fully
651 // ...exploited for performance's sake.
652 //
653 // 3. Implementation Notes
654 // =======================
655 //
656 // Table entries T_hi, T_lo; C_hi, C_lo; SC_inv
657 //
658 // Recall that 2^(-2) <= |r| <= pi/4;
659 //
660 // r = sgn_r * 2^k * 1.b_1 b_2 ... b_63
661 //
662 // and
663 //
664 // B = 2^k * 1.b_1 b_2 b_3 b_4 b_5 1
665 //
666 // Thus, for k = -2, possible values of B are
667 //
668 // B = 2^(-2) * ( 1 + index/32 + 1/64 ),
669 // index ranges from 0 to 31
670 //
671 // For k = -1, however, since |r| <= pi/4 = 0.78...
672 // possible values of B are
673 //
674 // B = 2^(-1) * ( 1 + index/32 + 1/64 )
675 // index ranges from 0 to 19.
676 //
677 //
678
679 #include "libm_support.h"
680
681 #ifdef _LIBC
682 .rodata
683 #else
684 .data
685 #endif
686
687 .align 128
688
689 TAN_BASE_CONSTANTS:
690 .type TAN_BASE_CONSTANTS, @object
691 data4 0x4B800000, 0xCB800000, 0x38800000, 0xB8800000 // two**24, -two**24
692 // two**-14, -two**-14
693 data4 0x4E44152A, 0xA2F9836E, 0x00003FFE, 0x00000000 // two_by_pi
694 data4 0xCE81B9F1, 0xC84D32B0, 0x00004016, 0x00000000 // P_0
695 data4 0x2168C235, 0xC90FDAA2, 0x00003FFF, 0x00000000 // P_1
696 data4 0xFC8F8CBB, 0xECE675D1, 0x0000BFBD, 0x00000000 // P_2
697 data4 0xACC19C60, 0xB7ED8FBB, 0x0000BF7C, 0x00000000 // P_3
698 data4 0x5F000000, 0xDF000000, 0x00000000, 0x00000000 // two_to_63, -two_to_63
699 data4 0x6EC6B45A, 0xA397E504, 0x00003FE7, 0x00000000 // Inv_P_0
700 data4 0xDBD171A1, 0x8D848E89, 0x0000BFBF, 0x00000000 // d_1
701 data4 0x18A66F8E, 0xD5394C36, 0x0000BF7C, 0x00000000 // d_2
702 data4 0x2168C234, 0xC90FDAA2, 0x00003FFE, 0x00000000 // PI_BY_4
703 data4 0x2168C234, 0xC90FDAA2, 0x0000BFFE, 0x00000000 // MPI_BY_4
704 data4 0x3E800000, 0xBE800000, 0x00000000, 0x00000000 // two**-2, -two**-2
705 data4 0x2F000000, 0xAF000000, 0x00000000, 0x00000000 // two**-33, -two**-33
706 data4 0xAAAAAABD, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P1_1
707 data4 0x88882E6A, 0x88888888, 0x00003FFC, 0x00000000 // P1_2
708 data4 0x0F0177B6, 0xDD0DD0DD, 0x00003FFA, 0x00000000 // P1_3
709 data4 0x646B8C6D, 0xB327A440, 0x00003FF9, 0x00000000 // P1_4
710 data4 0x1D5F7D20, 0x91371B25, 0x00003FF8, 0x00000000 // P1_5
711 data4 0x61C67914, 0xEB69A5F1, 0x00003FF6, 0x00000000 // P1_6
712 data4 0x019318D2, 0xBEDD37BE, 0x00003FF5, 0x00000000 // P1_7
713 data4 0x3C794015, 0x9979B146, 0x00003FF4, 0x00000000 // P1_8
714 data4 0x8C6EB58A, 0x8EBD21A3, 0x00003FF3, 0x00000000 // P1_9
715 data4 0xAAAAAAB4, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // Q1_1
716 data4 0x0B5FC93E, 0xB60B60B6, 0x00003FF9, 0x00000000 // Q1_2
717 data4 0x0C9BBFBF, 0x8AB355E0, 0x00003FF6, 0x00000000 // Q1_3
718 data4 0xCBEE3D4C, 0xDDEBBC89, 0x00003FF2, 0x00000000 // Q1_4
719 data4 0x5F80BBB6, 0xB3548A68, 0x00003FEF, 0x00000000 // Q1_5
720 data4 0x4CED5BF1, 0x91362560, 0x00003FEC, 0x00000000 // Q1_6
721 data4 0x8EE92A83, 0xF189D95A, 0x00003FE8, 0x00000000 // Q1_7
722 data4 0xAAAB362F, 0xAAAAAAAA, 0x00003FFD, 0x00000000 // P2_1
723 data4 0xE97A6097, 0x88888886, 0x00003FFC, 0x00000000 // P2_2
724 data4 0x25E716A1, 0xDD108EE0, 0x00003FFA, 0x00000000 // P2_3
725 //
726 // Entries T_hi double-precision memory format
727 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
728 // Entries T_lo single-precision memory format
729 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
730 //
731 data4 0x62400794, 0x3FD09BC3, 0x23A05C32, 0x00000000
732 data4 0xDFFBC074, 0x3FD124A9, 0x240078B2, 0x00000000
733 data4 0x5BD4920F, 0x3FD1AE23, 0x23826B8E, 0x00000000
734 data4 0x15E2701D, 0x3FD23835, 0x22D31154, 0x00000000
735 data4 0x63739C2D, 0x3FD2C2E4, 0x2265C9E2, 0x00000000
736 data4 0xAFEEA48B, 0x3FD34E36, 0x245C05EB, 0x00000000
737 data4 0x7DBB35D1, 0x3FD3DA31, 0x24749F2D, 0x00000000
738 data4 0x67321619, 0x3FD466DA, 0x2462CECE, 0x00000000
739 data4 0x1F94A4D5, 0x3FD4F437, 0x246D0DF1, 0x00000000
740 data4 0x740C3E6D, 0x3FD5824D, 0x240A85B5, 0x00000000
741 data4 0x4CB1E73D, 0x3FD61123, 0x23F96E33, 0x00000000
742 data4 0xAD9EA64B, 0x3FD6A0BE, 0x247C5393, 0x00000000
743 data4 0xB804FD01, 0x3FD73125, 0x241F3B29, 0x00000000
744 data4 0xAB53EE83, 0x3FD7C25E, 0x2479989B, 0x00000000
745 data4 0xE6640EED, 0x3FD8546F, 0x23B343BC, 0x00000000
746 data4 0xE8AF1892, 0x3FD8E75F, 0x241454D1, 0x00000000
747 data4 0x53928BDA, 0x3FD97B35, 0x238613D9, 0x00000000
748 data4 0xEB9DE4DE, 0x3FDA0FF6, 0x22859FA7, 0x00000000
749 data4 0x99ECF92D, 0x3FDAA5AB, 0x237A6D06, 0x00000000
750 data4 0x6D8F1796, 0x3FDB3C5A, 0x23952F6C, 0x00000000
751 data4 0x9CFB8BE4, 0x3FDBD40A, 0x2280FC95, 0x00000000
752 data4 0x87943100, 0x3FDC6CC3, 0x245D2EC0, 0x00000000
753 data4 0xB736C500, 0x3FDD068C, 0x23C4AD7D, 0x00000000
754 data4 0xE1DDBC31, 0x3FDDA16D, 0x23D076E6, 0x00000000
755 data4 0xEB515A93, 0x3FDE3D6E, 0x244809A6, 0x00000000
756 data4 0xE6E9E5F1, 0x3FDEDA97, 0x220856C8, 0x00000000
757 data4 0x1963CE69, 0x3FDF78F1, 0x244BE993, 0x00000000
758 data4 0x7D635BCE, 0x3FE00C41, 0x23D21799, 0x00000000
759 data4 0x1C302CD3, 0x3FE05CAB, 0x248A1B1D, 0x00000000
760 data4 0xDB6A1FA0, 0x3FE0ADB9, 0x23D53E33, 0x00000000
761 data4 0x4A20BA81, 0x3FE0FF72, 0x24DB9ED5, 0x00000000
762 data4 0x153FA6F5, 0x3FE151D9, 0x24E9E451, 0x00000000
763 //
764 // Entries T_hi double-precision memory format
765 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
766 // Entries T_lo single-precision memory format
767 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
768 //
769 data4 0xBA1BE39E, 0x3FE1CEC4, 0x24B60F9E, 0x00000000
770 data4 0x5ABD9B2D, 0x3FE277E4, 0x248C2474, 0x00000000
771 data4 0x0272B110, 0x3FE32418, 0x247B8311, 0x00000000
772 data4 0x890E2DF0, 0x3FE3D38B, 0x24C55751, 0x00000000
773 data4 0x46236871, 0x3FE4866D, 0x24E5BC34, 0x00000000
774 data4 0x45E044B0, 0x3FE53CEE, 0x24001BA4, 0x00000000
775 data4 0x82EC06E4, 0x3FE5F742, 0x24B973DC, 0x00000000
776 data4 0x25DF43F9, 0x3FE6B5A1, 0x24895440, 0x00000000
777 data4 0xCAFD348C, 0x3FE77844, 0x240021CA, 0x00000000
778 data4 0xCEED6B92, 0x3FE83F6B, 0x24C45372, 0x00000000
779 data4 0xA34F3665, 0x3FE90B58, 0x240DAD33, 0x00000000
780 data4 0x2C1E56B4, 0x3FE9DC52, 0x24F846CE, 0x00000000
781 data4 0x27041578, 0x3FEAB2A4, 0x2323FB6E, 0x00000000
782 data4 0x9DD8C373, 0x3FEB8E9F, 0x24B3090B, 0x00000000
783 data4 0x65C9AA7B, 0x3FEC709B, 0x2449F611, 0x00000000
784 data4 0xACCF8435, 0x3FED58F4, 0x23616A7E, 0x00000000
785 data4 0x97635082, 0x3FEE480F, 0x24C2FEAE, 0x00000000
786 data4 0xF0ACC544, 0x3FEF3E57, 0x242CE964, 0x00000000
787 data4 0xF7E06E4B, 0x3FF01E20, 0x2480D3EE, 0x00000000
788 data4 0x8A798A69, 0x3FF0A125, 0x24DB8967, 0x00000000
789 //
790 // Entries C_hi double-precision memory format
791 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
792 // Entries C_lo single-precision memory format
793 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
794 //
795 data4 0xE63EFBD0, 0x400ED3E2, 0x259D94D4, 0x00000000
796 data4 0xC515DAB5, 0x400DDDB4, 0x245F0537, 0x00000000
797 data4 0xBE19A79F, 0x400CF57A, 0x25D4EA9F, 0x00000000
798 data4 0xD15298ED, 0x400C1A06, 0x24AE40A0, 0x00000000
799 data4 0x164B2708, 0x400B4A4C, 0x25A5AAB6, 0x00000000
800 data4 0x5285B068, 0x400A855A, 0x25524F18, 0x00000000
801 data4 0x3FFA549F, 0x4009CA5A, 0x24C999C0, 0x00000000
802 data4 0x646AF623, 0x4009188A, 0x254FD801, 0x00000000
803 data4 0x6084D0E7, 0x40086F3C, 0x2560F5FD, 0x00000000
804 data4 0xA29A76EE, 0x4007CDD2, 0x255B9D19, 0x00000000
805 data4 0x6C8ECA95, 0x400733BE, 0x25CB021B, 0x00000000
806 data4 0x1F8DDC52, 0x4006A07E, 0x24AB4722, 0x00000000
807 data4 0xC298AD58, 0x4006139B, 0x252764E2, 0x00000000
808 data4 0xBAD7164B, 0x40058CAB, 0x24DAF5DB, 0x00000000
809 data4 0xAE31A5D3, 0x40050B4B, 0x25EA20F4, 0x00000000
810 data4 0x89F85A8A, 0x40048F21, 0x2583A3E8, 0x00000000
811 data4 0xA862380D, 0x400417DA, 0x25DCC4CC, 0x00000000
812 data4 0x1088FCFE, 0x4003A52B, 0x2430A492, 0x00000000
813 data4 0xCD3527D5, 0x400336CC, 0x255F77CF, 0x00000000
814 data4 0x5760766D, 0x4002CC7F, 0x25DA0BDA, 0x00000000
815 data4 0x11CE02E3, 0x40026607, 0x256FF4A2, 0x00000000
816 data4 0xD37BBE04, 0x4002032C, 0x25208AED, 0x00000000
817 data4 0x7F050775, 0x4001A3BD, 0x24B72DD6, 0x00000000
818 data4 0xA554848A, 0x40014789, 0x24AB4DAA, 0x00000000
819 data4 0x323E81B7, 0x4000EE65, 0x2584C440, 0x00000000
820 data4 0x21CF1293, 0x40009827, 0x25C9428D, 0x00000000
821 data4 0x3D415EEB, 0x400044A9, 0x25DC8482, 0x00000000
822 data4 0xBD72C577, 0x3FFFE78F, 0x257F5070, 0x00000000
823 data4 0x75EFD28E, 0x3FFF4AC3, 0x23EBBF7A, 0x00000000
824 data4 0x60B52DDE, 0x3FFEB2AF, 0x22EECA07, 0x00000000
825 data4 0x35204180, 0x3FFE1F19, 0x24191079, 0x00000000
826 data4 0x54F7E60A, 0x3FFD8FCA, 0x248D3058, 0x00000000
827 //
828 // Entries C_hi double-precision memory format
829 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
830 // Entries C_lo single-precision memory format
831 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
832 //
833 data4 0x79F6FADE, 0x3FFCC06A, 0x239C7886, 0x00000000
834 data4 0x891662A6, 0x3FFBB91F, 0x250BD191, 0x00000000
835 data4 0x529F155D, 0x3FFABFB6, 0x256CC3E6, 0x00000000
836 data4 0x2E964AE9, 0x3FF9D300, 0x250843E3, 0x00000000
837 data4 0x89DCB383, 0x3FF8F1EF, 0x2277C87E, 0x00000000
838 data4 0x7C87DBD6, 0x3FF81B93, 0x256DA6CF, 0x00000000
839 data4 0x1042EDE4, 0x3FF74F14, 0x2573D28A, 0x00000000
840 data4 0x1784B360, 0x3FF68BAF, 0x242E489A, 0x00000000
841 data4 0x7C923C4C, 0x3FF5D0B5, 0x2532D940, 0x00000000
842 data4 0xF418EF20, 0x3FF51D88, 0x253C7DD6, 0x00000000
843 data4 0x02F88DAE, 0x3FF4719A, 0x23DB59BF, 0x00000000
844 data4 0x49DA0788, 0x3FF3CC66, 0x252B4756, 0x00000000
845 data4 0x0B980DB8, 0x3FF32D77, 0x23FE585F, 0x00000000
846 data4 0xE56C987A, 0x3FF2945F, 0x25378A63, 0x00000000
847 data4 0xB16523F6, 0x3FF200BD, 0x247BB2E0, 0x00000000
848 data4 0x8CE27778, 0x3FF17235, 0x24446538, 0x00000000
849 data4 0xFDEFE692, 0x3FF0E873, 0x2514638F, 0x00000000
850 data4 0x33154062, 0x3FF0632C, 0x24A7FC27, 0x00000000
851 data4 0xB3EF115F, 0x3FEFC42E, 0x248FD0FE, 0x00000000
852 data4 0x135D26F6, 0x3FEEC9E8, 0x2385C719, 0x00000000
853 //
854 // Entries SC_inv in Swapped IEEE format (extended)
855 // Index = 0,1,...,31 B = 2^(-2)*(1+Index/32+1/64)
856 //
857 data4 0x1BF30C9E, 0x839D6D4A, 0x00004001, 0x00000000
858 data4 0x554B0EB0, 0x80092804, 0x00004001, 0x00000000
859 data4 0xA1CF0DE9, 0xF959F94C, 0x00004000, 0x00000000
860 data4 0x77378677, 0xF3086BA0, 0x00004000, 0x00000000
861 data4 0xCCD4723C, 0xED154515, 0x00004000, 0x00000000
862 data4 0x1C27CF25, 0xE7790944, 0x00004000, 0x00000000
863 data4 0x8DDACB88, 0xE22D037D, 0x00004000, 0x00000000
864 data4 0x89C73522, 0xDD2B2D8A, 0x00004000, 0x00000000
865 data4 0xBB2C1171, 0xD86E1A23, 0x00004000, 0x00000000
866 data4 0xDFF5E0F9, 0xD3F0E288, 0x00004000, 0x00000000
867 data4 0x283BEBD5, 0xCFAF16B1, 0x00004000, 0x00000000
868 data4 0x0D88DD53, 0xCBA4AFAA, 0x00004000, 0x00000000
869 data4 0xCA67C43D, 0xC7CE03CC, 0x00004000, 0x00000000
870 data4 0x0CA0DDB0, 0xC427BC82, 0x00004000, 0x00000000
871 data4 0xF13D8CAB, 0xC0AECD57, 0x00004000, 0x00000000
872 data4 0x71ECE6B1, 0xBD606C38, 0x00004000, 0x00000000
873 data4 0xA44C4929, 0xBA3A0A96, 0x00004000, 0x00000000
874 data4 0xE5CCCEC1, 0xB7394F6F, 0x00004000, 0x00000000
875 data4 0x9637D8BC, 0xB45C1203, 0x00004000, 0x00000000
876 data4 0x92CB051B, 0xB1A05528, 0x00004000, 0x00000000
877 data4 0x6BA2FFD0, 0xAF04432B, 0x00004000, 0x00000000
878 data4 0x7221235F, 0xAC862A23, 0x00004000, 0x00000000
879 data4 0x5F00A9D1, 0xAA2478AF, 0x00004000, 0x00000000
880 data4 0x81E082BF, 0xA7DDBB0C, 0x00004000, 0x00000000
881 data4 0x45684FEE, 0xA5B0987D, 0x00004000, 0x00000000
882 data4 0x627A8F53, 0xA39BD0F5, 0x00004000, 0x00000000
883 data4 0x6EC5C8B0, 0xA19E3B03, 0x00004000, 0x00000000
884 data4 0x91CD7C66, 0x9FB6C1F0, 0x00004000, 0x00000000
885 data4 0x1FA3DF8A, 0x9DE46410, 0x00004000, 0x00000000
886 data4 0xA8F6B888, 0x9C263139, 0x00004000, 0x00000000
887 data4 0xC27B0450, 0x9A7B4968, 0x00004000, 0x00000000
888 data4 0x5EE614EE, 0x98E2DB7E, 0x00004000, 0x00000000
889 //
890 // Entries SC_inv in Swapped IEEE format (extended)
891 // Index = 0,1,...,19 B = 2^(-1)*(1+Index/32+1/64)
892 //
893 data4 0x13B2B5BA, 0x969F335C, 0x00004000, 0x00000000
894 data4 0xD4C0F548, 0x93D446D9, 0x00004000, 0x00000000
895 data4 0x61B798AF, 0x9147094F, 0x00004000, 0x00000000
896 data4 0x758787AC, 0x8EF317CC, 0x00004000, 0x00000000
897 data4 0xB99EEFDB, 0x8CD498B3, 0x00004000, 0x00000000
898 data4 0xDFF8BC37, 0x8AE82A7D, 0x00004000, 0x00000000
899 data4 0xE3C55D42, 0x892AD546, 0x00004000, 0x00000000
900 data4 0xD15573C1, 0x8799FEA9, 0x00004000, 0x00000000
901 data4 0x435A4B4C, 0x86335F88, 0x00004000, 0x00000000
902 data4 0x3E93A87B, 0x84F4FB6E, 0x00004000, 0x00000000
903 data4 0x80A382FB, 0x83DD1952, 0x00004000, 0x00000000
904 data4 0xA4CB8C9E, 0x82EA3D7F, 0x00004000, 0x00000000
905 data4 0x6861D0A8, 0x821B247C, 0x00004000, 0x00000000
906 data4 0x63E8D244, 0x816EBED1, 0x00004000, 0x00000000
907 data4 0x27E4CFC6, 0x80E42D91, 0x00004000, 0x00000000
908 data4 0x28E64AFD, 0x807ABF8D, 0x00004000, 0x00000000
909 data4 0x863B4FD8, 0x8031EF26, 0x00004000, 0x00000000
910 data4 0xAE8C11FD, 0x800960AD, 0x00004000, 0x00000000
911 data4 0x5FDBEC21, 0x8000E147, 0x00004000, 0x00000000
912 data4 0xA07791FA, 0x80186650, 0x00004000, 0x00000000
913
914 Arg = f8
915 Result = f8
916 fp_tmp = f9
917 U_2 = f10
918 rsq = f11
919 C_hi = f12
920 C_lo = f13
921 T_hi = f14
922 T_lo = f15
923
924 N_0 = f32
925 d_1 = f33
926 MPI_BY_4 = f34
927 tail = f35
928 tanx = f36
929 Cx = f37
930 Sx = f38
931 sgn_r = f39
932 CORR = f40
933 P = f41
934 D = f42
935 ArgPrime = f43
936 P_0 = f44
937
938 P2_1 = f45
939 P2_2 = f46
940 P2_3 = f47
941
942 P1_1 = f45
943 P1_2 = f46
944 P1_3 = f47
945
946 P1_4 = f48
947 P1_5 = f49
948 P1_6 = f50
949 P1_7 = f51
950 P1_8 = f52
951 P1_9 = f53
952
953 TWO_TO_63 = f54
954 NEGTWO_TO_63 = f55
955 x = f56
956 xsq = f57
957 Tx = f58
958 Tx1 = f59
959 Set = f60
960 poly1 = f61
961 poly2 = f62
962 Poly = f63
963 Poly1 = f64
964 Poly2 = f65
965 r_to_the_8 = f66
966 B = f67
967 SC_inv = f68
968 Pos_r = f69
969 N_0_fix = f70
970 PI_BY_4 = f71
971 NEGTWO_TO_NEG2 = f72
972 TWO_TO_24 = f73
973 TWO_TO_NEG14 = f74
974 TWO_TO_NEG33 = f75
975 NEGTWO_TO_24 = f76
976 NEGTWO_TO_NEG14 = f76
977 NEGTWO_TO_NEG33 = f77
978 two_by_PI = f78
979 N = f79
980 N_fix = f80
981 P_1 = f81
982 P_2 = f82
983 P_3 = f83
984 s_val = f84
985 w = f85
986 c = f86
987 r = f87
988 Z = f88
989 A = f89
990 a = f90
991 t = f91
992 U_1 = f92
993 d_2 = f93
994 TWO_TO_NEG2 = f94
995 Q1_1 = f95
996 Q1_2 = f96
997 Q1_3 = f97
998 Q1_4 = f98
999 Q1_5 = f99
1000 Q1_6 = f100
1001 Q1_7 = f101
1002 Q1_8 = f102
1003 S_hi = f103
1004 S_lo = f104
1005 V_hi = f105
1006 V_lo = f106
1007 U_hi = f107
1008 U_lo = f108
1009 U_hiabs = f109
1010 V_hiabs = f110
1011 V = f111
1012 Inv_P_0 = f112
1013
1014 GR_SAVE_B0 = r33
1015 GR_SAVE_GP = r34
1016 GR_SAVE_PFS = r35
1017
1018 delta1 = r36
1019 table_ptr1 = r37
1020 table_ptr2 = r38
1021 i_0 = r39
1022 i_1 = r40
1023 N_fix_gr = r41
1024 N_inc = r42
1025 exp_Arg = r43
1026 exp_r = r44
1027 sig_r = r45
1028 lookup = r46
1029 table_offset = r47
1030 Create_B = r48
1031 gr_tmp = r49
1032
1033 GR_Parameter_X = r49
1034 GR_Parameter_r = r50
1035
1036
1037
1038 .global __libm_tan
1039 .section .text
1040 .proc __libm_tan
1041
1042
1043 __libm_tan:
1044
1045 { .mfi
1046 alloc r32 = ar.pfs, 0,17,2,0
1047 (p0) fclass.m.unc p6,p0 = Arg, 0x1E7
1048 addl gr_tmp = -1,r0
1049 }
1050 ;;
1051
1052 { .mfi
1053 nop.m 999
1054 (p0) fclass.nm.unc p7,p0 = Arg, 0x1FF
1055 nop.i 999
1056 }
1057 ;;
1058
1059 { .mfi
1060 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
1061 nop.f 999
1062 nop.i 999
1063 }
1064 ;;
1065
1066 { .mmi
1067 ld8 table_ptr1 = [table_ptr1]
1068 setf.sig fp_tmp = gr_tmp // Make a constant so fmpy produces inexact
1069 nop.i 999
1070 }
1071 ;;
1072
1073 //
1074 // Check for NatVals, Infs , NaNs, and Zeros
1075 // Check for everything - if false, then must be pseudo-zero
1076 // or pseudo-nan.
1077 // Local table pointer
1078 //
1079
1080 { .mbb
1081 (p0) add table_ptr2 = 96, table_ptr1
1082 (p6) br.cond.spnt __libm_TAN_SPECIAL
1083 (p7) br.cond.spnt __libm_TAN_SPECIAL ;;
1084 }
1085 //
1086 // Point to Inv_P_0
1087 // Branch out to deal with unsupporteds and special values.
1088 //
1089
1090 { .mmf
1091 (p0) ldfs TWO_TO_24 = [table_ptr1],4
1092 (p0) ldfs TWO_TO_63 = [table_ptr2],4
1093 //
1094 // Load -2**24, load -2**63.
1095 //
1096 (p0) fcmp.eq.s0 p0, p6 = Arg, f1 ;;
1097 }
1098
1099 { .mfi
1100 (p0) ldfs NEGTWO_TO_63 = [table_ptr2],12
1101 (p0) fnorm.s1 Arg = Arg
1102 nop.i 999
1103 }
1104 //
1105 // Load 2**24, Load 2**63.
1106 //
1107
1108 { .mmi
1109 (p0) ldfs NEGTWO_TO_24 = [table_ptr1],12 ;;
1110 //
1111 // Do fcmp to generate Denormal exception
1112 // - can't do FNORM (will generate Underflow when U is unmasked!)
1113 // Normalize input argument.
1114 //
1115 (p0) ldfe two_by_PI = [table_ptr1],16
1116 nop.i 999
1117 }
1118
1119 { .mmi
1120 (p0) ldfe Inv_P_0 = [table_ptr2],16 ;;
1121 (p0) ldfe d_1 = [table_ptr2],16
1122 nop.i 999
1123 }
1124 //
1125 // Decide about the paths to take:
1126 // PR_1 and PR_3 set if -2**24 < Arg < 2**24 - CASE 1 OR 2
1127 // OTHERWISE - CASE 3 OR 4
1128 // Load inverse of P_0 .
1129 // Set PR_6 if Arg <= -2**63
1130 // Are there any Infs, NaNs, or zeros?
1131 //
1132
1133 { .mmi
1134 (p0) ldfe P_0 = [table_ptr1],16 ;;
1135 (p0) ldfe d_2 = [table_ptr2],16
1136 nop.i 999
1137 }
1138 //
1139 // Set PR_8 if Arg <= -2**24
1140 // Set PR_6 if Arg >= 2**63
1141 //
1142
1143 { .mmi
1144 (p0) ldfe P_1 = [table_ptr1],16 ;;
1145 (p0) ldfe PI_BY_4 = [table_ptr2],16
1146 nop.i 999
1147 }
1148 //
1149 // Set PR_8 if Arg >= 2**24
1150 //
1151
1152 { .mmi
1153 (p0) ldfe P_2 = [table_ptr1],16 ;;
1154 (p0) ldfe MPI_BY_4 = [table_ptr2],16
1155 nop.i 999
1156 }
1157 //
1158 // Load P_2 and PI_BY_4
1159 //
1160
1161 { .mfi
1162 (p0) ldfe P_3 = [table_ptr1],16
1163 nop.f 999
1164 nop.i 999 ;;
1165 }
1166
1167 { .mfi
1168 nop.m 999
1169 (p0) fcmp.le.unc.s1 p6,p7 = Arg,NEGTWO_TO_63
1170 nop.i 999
1171 }
1172
1173 { .mfi
1174 nop.m 999
1175 (p0) fcmp.le.unc.s1 p8,p9 = Arg,NEGTWO_TO_24
1176 nop.i 999 ;;
1177 }
1178
1179 { .mfi
1180 nop.m 999
1181 (p7) fcmp.ge.s1 p6,p0 = Arg,TWO_TO_63
1182 nop.i 999
1183 }
1184
1185 { .mfi
1186 nop.m 999
1187 (p9) fcmp.ge.s1 p8,p0 = Arg,TWO_TO_24
1188 nop.i 999 ;;
1189 }
1190
1191 { .mib
1192 nop.m 999
1193 nop.i 999
1194 //
1195 // Load P_3 and -PI_BY_4
1196 //
1197 (p6) br.cond.spnt TAN_ARG_TOO_LARGE ;;
1198 }
1199
1200 { .mib
1201 nop.m 999
1202 nop.i 999
1203 //
1204 // Load 2**(-2).
1205 // Load -2**(-2).
1206 // Branch out if we have a special argument.
1207 // Branch out if the magnitude of the input argument is too large
1208 // - do this branch before the next.
1209 //
1210 (p8) br.cond.spnt TAN_LARGER_ARG ;;
1211 }
1212 //
1213 // Branch to Cases 3 or 4 if Arg <= -2**24 or Arg >= 2**24
1214 //
1215
1216 { .mfi
1217 (p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
1218 // ARGUMENT REDUCTION CODE - CASE 1 and 2
1219 // Load 2**(-2).
1220 // Load -2**(-2).
1221 (p0) fmpy.s1 N = Arg,two_by_PI
1222 nop.i 999 ;;
1223 }
1224
1225 { .mfi
1226 (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],12
1227 //
1228 // N = Arg * 2/pi
1229 //
1230 (p0) fcmp.lt.unc.s1 p8,p9= Arg,PI_BY_4
1231 nop.i 999 ;;
1232 }
1233
1234 { .mfi
1235 nop.m 999
1236 //
1237 // if Arg < pi/4, set PR_8.
1238 //
1239 (p8) fcmp.gt.s1 p8,p9= Arg,MPI_BY_4
1240 nop.i 999 ;;
1241 }
1242 //
1243 // Case 1: Is |r| < 2**(-2).
1244 // Arg is the same as r in this case.
1245 // r = Arg
1246 // c = 0
1247 //
1248
1249 { .mfi
1250 (p8) mov N_fix_gr = r0
1251 //
1252 // if Arg > -pi/4, reset PR_8.
1253 // Select the case when |Arg| < pi/4 - set PR[8] = true.
1254 // Else Select the case when |Arg| >= pi/4 - set PR[9] = true.
1255 //
1256 (p0) fcvt.fx.s1 N_fix = N
1257 nop.i 999 ;;
1258 }
1259
1260 { .mfi
1261 nop.m 999
1262 //
1263 // Grab the integer part of N .
1264 //
1265 (p8) mov r = Arg
1266 nop.i 999
1267 }
1268
1269 { .mfi
1270 nop.m 999
1271 (p8) mov c = f0
1272 nop.i 999 ;;
1273 }
1274
1275 { .mfi
1276 nop.m 999
1277 (p8) fcmp.lt.unc.s1 p10, p11 = Arg, TWO_TO_NEG2
1278 nop.i 999 ;;
1279 }
1280
1281 { .mfi
1282 nop.m 999
1283 (p10) fcmp.gt.s1 p10,p0 = Arg, NEGTWO_TO_NEG2
1284 nop.i 999 ;;
1285 }
1286
1287 { .mfi
1288 nop.m 999
1289 //
1290 // Case 2: Place integer part of N in GP register.
1291 //
1292 (p9) fcvt.xf N = N_fix
1293 nop.i 999 ;;
1294 }
1295
1296 { .mib
1297 (p9) getf.sig N_fix_gr = N_fix
1298 nop.i 999
1299 //
1300 // Case 2: Convert integer N_fix back to normalized floating-point value.
1301 //
1302 (p10) br.cond.spnt TAN_SMALL_R ;;
1303 }
1304
1305 { .mib
1306 nop.m 999
1307 nop.i 999
1308 (p8) br.cond.sptk TAN_NORMAL_R ;;
1309 }
1310 //
1311 // Case 1: PR_3 is only affected when PR_1 is set.
1312 //
1313
1314 { .mmi
1315 (p9) ldfs TWO_TO_NEG33 = [table_ptr2], 4 ;;
1316 //
1317 // Case 2: Load 2**(-33).
1318 //
1319 (p9) ldfs NEGTWO_TO_NEG33 = [table_ptr2], 4
1320 nop.i 999 ;;
1321 }
1322
1323 { .mfi
1324 nop.m 999
1325 //
1326 // Case 2: Load -2**(-33).
1327 //
1328 (p9) fnma.s1 s_val = N, P_1, Arg
1329 nop.i 999
1330 }
1331
1332 { .mfi
1333 nop.m 999
1334 (p9) fmpy.s1 w = N, P_2
1335 nop.i 999 ;;
1336 }
1337
1338 { .mfi
1339 nop.m 999
1340 //
1341 // Case 2: w = N * P_2
1342 // Case 2: s_val = -N * P_1 + Arg
1343 //
1344 (p0) fcmp.lt.unc.s1 p9,p8 = s_val, TWO_TO_NEG33
1345 nop.i 999 ;;
1346 }
1347
1348 { .mfi
1349 nop.m 999
1350 //
1351 // Decide between case_1 and case_2 reduce:
1352 //
1353 (p9) fcmp.gt.s1 p9, p8 = s_val, NEGTWO_TO_NEG33
1354 nop.i 999 ;;
1355 }
1356
1357 { .mfi
1358 nop.m 999
1359 //
1360 // Case 1_reduce: s <= -2**(-33) or s >= 2**(-33)
1361 // Case 2_reduce: -2**(-33) < s < 2**(-33)
1362 //
1363 (p8) fsub.s1 r = s_val, w
1364 nop.i 999
1365 }
1366
1367 { .mfi
1368 nop.m 999
1369 (p9) fmpy.s1 w = N, P_3
1370 nop.i 999 ;;
1371 }
1372
1373 { .mfi
1374 nop.m 999
1375 (p9) fma.s1 U_1 = N, P_2, w
1376 nop.i 999
1377 }
1378
1379 { .mfi
1380 nop.m 999
1381 //
1382 // Case 1_reduce: Is |r| < 2**(-2), if so set PR_10
1383 // else set PR_11.
1384 //
1385 (p8) fsub.s1 c = s_val, r
1386 nop.i 999 ;;
1387 }
1388
1389 { .mfi
1390 nop.m 999
1391 //
1392 // Case 1_reduce: r = s + w (change sign)
1393 // Case 2_reduce: w = N * P_3 (change sign)
1394 //
1395 (p8) fcmp.lt.unc.s1 p10, p11 = r, TWO_TO_NEG2
1396 nop.i 999 ;;
1397 }
1398
1399 { .mfi
1400 nop.m 999
1401 (p10) fcmp.gt.s1 p10, p11 = r, NEGTWO_TO_NEG2
1402 nop.i 999 ;;
1403 }
1404
1405 { .mfi
1406 nop.m 999
1407 (p9) fsub.s1 r = s_val, U_1
1408 nop.i 999
1409 }
1410
1411 { .mfi
1412 nop.m 999
1413 //
1414 // Case 1_reduce: c is complete here.
1415 // c = c + w (w has not been negated.)
1416 // Case 2_reduce: r is complete here - continue to calculate c .
1417 // r = s - U_1
1418 //
1419 (p9) fms.s1 U_2 = N, P_2, U_1
1420 nop.i 999 ;;
1421 }
1422
1423 { .mfi
1424 nop.m 999
1425 //
1426 // Case 1_reduce: c = s - r
1427 // Case 2_reduce: U_1 = N * P_2 + w
1428 //
1429 (p8) fsub.s1 c = c, w
1430 nop.i 999 ;;
1431 }
1432
1433 { .mfi
1434 nop.m 999
1435 (p9) fsub.s1 s_val = s_val, r
1436 nop.i 999
1437 }
1438
1439 { .mfb
1440 nop.m 999
1441 //
1442 // Case 2_reduce:
1443 // U_2 = N * P_2 - U_1
1444 // Not needed until later.
1445 //
1446 (p9) fadd.s1 U_2 = U_2, w
1447 //
1448 // Case 2_reduce:
1449 // s = s - r
1450 // U_2 = U_2 + w
1451 //
1452 (p10) br.cond.spnt TAN_SMALL_R ;;
1453 }
1454
1455 { .mib
1456 nop.m 999
1457 nop.i 999
1458 (p11) br.cond.sptk TAN_NORMAL_R ;;
1459 }
1460
1461 { .mii
1462 nop.m 999
1463 //
1464 // Case 2_reduce:
1465 // c = c - U_2
1466 // c is complete here
1467 // Argument reduction ends here.
1468 //
1469 (p9) extr.u i_1 = N_fix_gr, 0, 1 ;;
1470 (p9) cmp.eq.unc p11, p12 = 0x0000,i_1 ;;
1471 }
1472
1473 { .mfi
1474 nop.m 999
1475 //
1476 // Is i_1 even or odd?
1477 // if i_1 == 0, set p11, else set p12.
1478 //
1479 (p11) fmpy.s1 rsq = r, Z
1480 nop.i 999 ;;
1481 }
1482
1483 { .mfi
1484 nop.m 999
1485 (p12) frcpa.s1 S_hi,p0 = f1, r
1486 nop.i 999
1487 }
1488
1489 //
1490 // Case 1: Branch to SMALL_R or NORMAL_R.
1491 // Case 1 is done now.
1492 //
1493
1494 { .mfi
1495 (p9) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
1496 (p9) fsub.s1 c = s_val, U_1
1497 nop.i 999 ;;
1498 }
1499 ;;
1500
1501 { .mmi
1502 (p9) ld8 table_ptr1 = [table_ptr1]
1503 nop.m 999
1504 nop.i 999
1505 }
1506 ;;
1507
1508 { .mmi
1509 (p9) add table_ptr1 = 224, table_ptr1 ;;
1510 (p9) ldfe P1_1 = [table_ptr1],144
1511 nop.i 999 ;;
1512 }
1513 //
1514 // Get [i_1] - lsb of N_fix_gr .
1515 // Load P1_1 and point to Q1_1 .
1516 //
1517
1518 { .mfi
1519 (p9) ldfe Q1_1 = [table_ptr1] , 0
1520 //
1521 // N even: rsq = r * Z
1522 // N odd: S_hi = frcpa(r)
1523 //
1524 (p12) fmerge.ns S_hi = S_hi, S_hi
1525 nop.i 999
1526 }
1527
1528 { .mfi
1529 nop.m 999
1530 //
1531 // Case 2_reduce:
1532 // c = s - U_1
1533 //
1534 (p9) fsub.s1 c = c, U_2
1535 nop.i 999 ;;
1536 }
1537
1538 { .mfi
1539 nop.m 999
1540 (p12) fma.s1 poly1 = S_hi, r, f1
1541 nop.i 999 ;;
1542 }
1543
1544 { .mfi
1545 nop.m 999
1546 //
1547 // N odd: Change sign of S_hi
1548 //
1549 (p11) fmpy.s1 rsq = rsq, P1_1
1550 nop.i 999 ;;
1551 }
1552
1553 { .mfi
1554 nop.m 999
1555 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1556 nop.i 999 ;;
1557 }
1558
1559 { .mfi
1560 nop.m 999
1561 //
1562 // N even: rsq = rsq * P1_1
1563 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
1564 //
1565 (p11) fma.s1 Result = r, rsq, c
1566 nop.i 999 ;;
1567 }
1568
1569 { .mfi
1570 nop.m 999
1571 //
1572 // N even: Result = c + r * rsq
1573 // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
1574 //
1575 (p12) fma.s1 poly1 = S_hi, r, f1
1576 nop.i 999 ;;
1577 }
1578
1579 { .mfi
1580 nop.m 999
1581 //
1582 // N even: Result = Result + r
1583 // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
1584 //
1585 (p11) fadd.s0 Result = r, Result
1586 nop.i 999 ;;
1587 }
1588
1589 { .mfi
1590 nop.m 999
1591 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1592 nop.i 999 ;;
1593 }
1594
1595 { .mfi
1596 nop.m 999
1597 //
1598 // N even: Result1 = Result + r
1599 // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
1600 //
1601 (p12) fma.s1 poly1 = S_hi, r, f1
1602 nop.i 999 ;;
1603 }
1604
1605 { .mfi
1606 nop.m 999
1607 //
1608 // N odd: poly1 = S_hi * r + 1.0 64 bits partial
1609 //
1610 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
1611 nop.i 999 ;;
1612 }
1613
1614 { .mfi
1615 nop.m 999
1616 //
1617 // N odd: poly1 = S_hi * poly + 1.0 64 bits
1618 //
1619 (p12) fma.s1 poly1 = S_hi, r, f1
1620 nop.i 999 ;;
1621 }
1622
1623 { .mfi
1624 nop.m 999
1625 //
1626 // N odd: poly1 = S_hi * r + 1.0
1627 //
1628 (p12) fma.s1 poly1 = S_hi, c, poly1
1629 nop.i 999 ;;
1630 }
1631
1632 { .mfi
1633 nop.m 999
1634 //
1635 // N odd: poly1 = S_hi * c + poly1
1636 //
1637 (p12) fmpy.s1 S_lo = S_hi, poly1
1638 nop.i 999 ;;
1639 }
1640
1641 { .mfi
1642 nop.m 999
1643 //
1644 // N odd: S_lo = S_hi * poly1
1645 //
1646 (p12) fma.s1 S_lo = Q1_1, r, S_lo
1647 nop.i 999
1648 }
1649
1650 { .mfi
1651 nop.m 999
1652 //
1653 // N odd: Result = S_hi + S_lo
1654 //
1655 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
1656 nop.i 999 ;;
1657 }
1658
1659 { .mfb
1660 nop.m 999
1661 //
1662 // N odd: S_lo = S_lo + Q1_1 * r
1663 //
1664 (p12) fadd.s0 Result = S_hi, S_lo
1665 (p0) br.ret.sptk b0 ;;
1666 }
1667
1668
1669 TAN_LARGER_ARG:
1670
1671 { .mmf
1672 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
1673 nop.m 999
1674 (p0) fmpy.s1 N_0 = Arg, Inv_P_0
1675 }
1676 ;;
1677
1678 //
1679 // ARGUMENT REDUCTION CODE - CASE 3 and 4
1680 //
1681 //
1682 // Adjust table_ptr1 to beginning of table.
1683 // N_0 = Arg * Inv_P_0
1684 //
1685
1686
1687 { .mmi
1688 (p0) ld8 table_ptr1 = [table_ptr1]
1689 nop.m 999
1690 nop.i 999
1691 }
1692 ;;
1693
1694
1695 { .mmi
1696 (p0) add table_ptr1 = 8, table_ptr1 ;;
1697 //
1698 // Point to 2*-14
1699 //
1700 (p0) ldfs TWO_TO_NEG14 = [table_ptr1], 4
1701 nop.i 999 ;;
1702 }
1703 //
1704 // Load 2**(-14).
1705 //
1706
1707 { .mmi
1708 (p0) ldfs NEGTWO_TO_NEG14 = [table_ptr1], 180 ;;
1709 //
1710 // N_0_fix = integer part of N_0 .
1711 // Adjust table_ptr1 to beginning of table.
1712 //
1713 (p0) ldfs TWO_TO_NEG2 = [table_ptr1], 4
1714 nop.i 999 ;;
1715 }
1716 //
1717 // Make N_0 the integer part.
1718 //
1719
1720 { .mfi
1721 (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr1]
1722 //
1723 // Load -2**(-14).
1724 //
1725 (p0) fcvt.fx.s1 N_0_fix = N_0
1726 nop.i 999 ;;
1727 }
1728
1729 { .mfi
1730 nop.m 999
1731 (p0) fcvt.xf N_0 = N_0_fix
1732 nop.i 999 ;;
1733 }
1734
1735 { .mfi
1736 nop.m 999
1737 (p0) fnma.s1 ArgPrime = N_0, P_0, Arg
1738 nop.i 999
1739 }
1740
1741 { .mfi
1742 nop.m 999
1743 (p0) fmpy.s1 w = N_0, d_1
1744 nop.i 999 ;;
1745 }
1746
1747 { .mfi
1748 nop.m 999
1749 //
1750 // ArgPrime = -N_0 * P_0 + Arg
1751 // w = N_0 * d_1
1752 //
1753 (p0) fmpy.s1 N = ArgPrime, two_by_PI
1754 nop.i 999 ;;
1755 }
1756
1757 { .mfi
1758 nop.m 999
1759 //
1760 // N = ArgPrime * 2/pi
1761 //
1762 (p0) fcvt.fx.s1 N_fix = N
1763 nop.i 999 ;;
1764 }
1765
1766 { .mfi
1767 nop.m 999
1768 //
1769 // N_fix is the integer part.
1770 //
1771 (p0) fcvt.xf N = N_fix
1772 nop.i 999 ;;
1773 }
1774
1775 { .mfi
1776 (p0) getf.sig N_fix_gr = N_fix
1777 nop.f 999
1778 nop.i 999 ;;
1779 }
1780
1781 { .mfi
1782 nop.m 999
1783 //
1784 // N is the integer part of the reduced-reduced argument.
1785 // Put the integer in a GP register.
1786 //
1787 (p0) fnma.s1 s_val = N, P_1, ArgPrime
1788 nop.i 999
1789 }
1790
1791 { .mfi
1792 nop.m 999
1793 (p0) fnma.s1 w = N, P_2, w
1794 nop.i 999 ;;
1795 }
1796
1797 { .mfi
1798 nop.m 999
1799 //
1800 // s_val = -N*P_1 + ArgPrime
1801 // w = -N*P_2 + w
1802 //
1803 (p0) fcmp.lt.unc.s1 p11, p10 = s_val, TWO_TO_NEG14
1804 nop.i 999 ;;
1805 }
1806
1807 { .mfi
1808 nop.m 999
1809 (p11) fcmp.gt.s1 p11, p10 = s_val, NEGTWO_TO_NEG14
1810 nop.i 999 ;;
1811 }
1812
1813 { .mfi
1814 nop.m 999
1815 //
1816 // Case 3: r = s_val + w (Z complete)
1817 // Case 4: U_hi = N_0 * d_1
1818 //
1819 (p10) fmpy.s1 V_hi = N, P_2
1820 nop.i 999
1821 }
1822
1823 { .mfi
1824 nop.m 999
1825 (p11) fmpy.s1 U_hi = N_0, d_1
1826 nop.i 999 ;;
1827 }
1828
1829 { .mfi
1830 nop.m 999
1831 //
1832 // Case 3: r = s_val + w (Z complete)
1833 // Case 4: U_hi = N_0 * d_1
1834 //
1835 (p11) fmpy.s1 V_hi = N, P_2
1836 nop.i 999
1837 }
1838
1839 { .mfi
1840 nop.m 999
1841 (p11) fmpy.s1 U_hi = N_0, d_1
1842 nop.i 999 ;;
1843 }
1844
1845 { .mfi
1846 nop.m 999
1847 //
1848 // Decide between case 3 and 4:
1849 // Case 3: s <= -2**(-14) or s >= 2**(-14)
1850 // Case 4: -2**(-14) < s < 2**(-14)
1851 //
1852 (p10) fadd.s1 r = s_val, w
1853 nop.i 999
1854 }
1855
1856 { .mfi
1857 nop.m 999
1858 (p11) fmpy.s1 w = N, P_3
1859 nop.i 999 ;;
1860 }
1861
1862 { .mfi
1863 nop.m 999
1864 //
1865 // Case 4: We need abs of both U_hi and V_hi - dont
1866 // worry about switched sign of V_hi .
1867 //
1868 (p11) fsub.s1 A = U_hi, V_hi
1869 nop.i 999
1870 }
1871
1872 { .mfi
1873 nop.m 999
1874 //
1875 // Case 4: A = U_hi + V_hi
1876 // Note: Worry about switched sign of V_hi, so subtract instead of add.
1877 //
1878 (p11) fnma.s1 V_lo = N, P_2, V_hi
1879 nop.i 999 ;;
1880 }
1881
1882 { .mfi
1883 nop.m 999
1884 (p11) fms.s1 U_lo = N_0, d_1, U_hi
1885 nop.i 999 ;;
1886 }
1887
1888 { .mfi
1889 nop.m 999
1890 (p11) fabs V_hiabs = V_hi
1891 nop.i 999
1892 }
1893
1894 { .mfi
1895 nop.m 999
1896 //
1897 // Case 4: V_hi = N * P_2
1898 // w = N * P_3
1899 // Note the product does not include the (-) as in the writeup
1900 // so (-) missing for V_hi and w .
1901 (p10) fadd.s1 r = s_val, w
1902 nop.i 999 ;;
1903 }
1904
1905 { .mfi
1906 nop.m 999
1907 //
1908 // Case 3: c = s_val - r
1909 // Case 4: U_lo = N_0 * d_1 - U_hi
1910 //
1911 (p11) fabs U_hiabs = U_hi
1912 nop.i 999
1913 }
1914
1915 { .mfi
1916 nop.m 999
1917 (p11) fmpy.s1 w = N, P_3
1918 nop.i 999 ;;
1919 }
1920
1921 { .mfi
1922 nop.m 999
1923 //
1924 // Case 4: Set P_12 if U_hiabs >= V_hiabs
1925 //
1926 (p11) fadd.s1 C_hi = s_val, A
1927 nop.i 999 ;;
1928 }
1929
1930 { .mfi
1931 nop.m 999
1932 //
1933 // Case 4: C_hi = s_val + A
1934 //
1935 (p11) fadd.s1 t = U_lo, V_lo
1936 nop.i 999 ;;
1937 }
1938
1939 { .mfi
1940 nop.m 999
1941 //
1942 // Case 3: Is |r| < 2**(-2), if so set PR_7
1943 // else set PR_8.
1944 // Case 3: If PR_7 is set, prepare to branch to Small_R.
1945 // Case 3: If PR_8 is set, prepare to branch to Normal_R.
1946 //
1947 (p10) fsub.s1 c = s_val, r
1948 nop.i 999 ;;
1949 }
1950
1951 { .mfi
1952 nop.m 999
1953 //
1954 // Case 3: c = (s - r) + w (c complete)
1955 //
1956 (p11) fcmp.ge.unc.s1 p12, p13 = U_hiabs, V_hiabs
1957 nop.i 999
1958 }
1959
1960 { .mfi
1961 nop.m 999
1962 (p11) fms.s1 w = N_0, d_2, w
1963 nop.i 999 ;;
1964 }
1965
1966 { .mfi
1967 nop.m 999
1968 //
1969 // Case 4: V_hi = N * P_2
1970 // w = N * P_3
1971 // Note the product does not include the (-) as in the writeup
1972 // so (-) missing for V_hi and w .
1973 //
1974 (p10) fcmp.lt.unc.s1 p14, p15 = r, TWO_TO_NEG2
1975 nop.i 999 ;;
1976 }
1977
1978 { .mfi
1979 nop.m 999
1980 (p14) fcmp.gt.s1 p14, p15 = r, NEGTWO_TO_NEG2
1981 nop.i 999 ;;
1982 }
1983
1984 { .mfb
1985 nop.m 999
1986 //
1987 // Case 4: V_lo = -N * P_2 - V_hi (U_hi is in place of V_hi in writeup)
1988 // Note: the (-) is still missing for V_hi .
1989 // Case 4: w = w + N_0 * d_2
1990 // Note: the (-) is now incorporated in w .
1991 //
1992 (p10) fadd.s1 c = c, w
1993 //
1994 // Case 4: t = U_lo + V_lo
1995 // Note: remember V_lo should be (-), subtract instead of add. NO
1996 //
1997 (p14) br.cond.spnt TAN_SMALL_R ;;
1998 }
1999
2000 { .mib
2001 nop.m 999
2002 nop.i 999
2003 (p15) br.cond.spnt TAN_NORMAL_R ;;
2004 }
2005
2006 { .mfi
2007 nop.m 999
2008 //
2009 // Case 3: Vector off when |r| < 2**(-2). Recall that PR_3 will be true.
2010 // The remaining stuff is for Case 4.
2011 //
2012 (p12) fsub.s1 a = U_hi, A
2013 (p11) extr.u i_1 = N_fix_gr, 0, 1 ;;
2014 }
2015
2016 { .mfi
2017 nop.m 999
2018 //
2019 // Case 4: C_lo = s_val - C_hi
2020 //
2021 (p11) fadd.s1 t = t, w
2022 nop.i 999
2023 }
2024
2025 { .mfi
2026 nop.m 999
2027 (p13) fadd.s1 a = V_hi, A
2028 nop.i 999 ;;
2029 }
2030
2031 //
2032 // Case 4: a = U_hi - A
2033 // a = V_hi - A (do an add to account for missing (-) on V_hi
2034 //
2035
2036 { .mfi
2037 (p11) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
2038 (p11) fsub.s1 C_lo = s_val, C_hi
2039 nop.i 999
2040 }
2041 ;;
2042
2043 { .mmi
2044 (p11) ld8 table_ptr1 = [table_ptr1]
2045 nop.m 999
2046 nop.i 999
2047 }
2048 ;;
2049
2050 //
2051 // Case 4: a = (U_hi - A) + V_hi
2052 // a = (V_hi - A) + U_hi
2053 // In each case account for negative missing form V_hi .
2054 //
2055 //
2056 // Case 4: C_lo = (s_val - C_hi) + A
2057 //
2058
2059 { .mmi
2060 (p11) add table_ptr1 = 224, table_ptr1 ;;
2061 (p11) ldfe P1_1 = [table_ptr1], 16
2062 nop.i 999 ;;
2063 }
2064
2065 { .mfi
2066 (p11) ldfe P1_2 = [table_ptr1], 128
2067 //
2068 // Case 4: w = U_lo + V_lo + w
2069 //
2070 (p12) fsub.s1 a = a, V_hi
2071 nop.i 999 ;;
2072 }
2073 //
2074 // Case 4: r = C_hi + C_lo
2075 //
2076
2077 { .mfi
2078 (p11) ldfe Q1_1 = [table_ptr1], 16
2079 (p11) fadd.s1 C_lo = C_lo, A
2080 nop.i 999 ;;
2081 }
2082 //
2083 // Case 4: c = C_hi - r
2084 // Get [i_1] - lsb of N_fix_gr.
2085 //
2086
2087 { .mfi
2088 (p11) ldfe Q1_2 = [table_ptr1], 16
2089 nop.f 999
2090 nop.i 999 ;;
2091 }
2092
2093 { .mfi
2094 nop.m 999
2095 (p13) fsub.s1 a = U_hi, a
2096 nop.i 999 ;;
2097 }
2098
2099 { .mfi
2100 nop.m 999
2101 (p11) fadd.s1 t = t, a
2102 nop.i 999 ;;
2103 }
2104
2105 { .mfi
2106 nop.m 999
2107 //
2108 // Case 4: t = t + a
2109 //
2110 (p11) fadd.s1 C_lo = C_lo, t
2111 nop.i 999 ;;
2112 }
2113
2114 { .mfi
2115 nop.m 999
2116 //
2117 // Case 4: C_lo = C_lo + t
2118 //
2119 (p11) fadd.s1 r = C_hi, C_lo
2120 nop.i 999 ;;
2121 }
2122
2123 { .mfi
2124 nop.m 999
2125 (p11) fsub.s1 c = C_hi, r
2126 nop.i 999
2127 }
2128
2129 { .mfi
2130 nop.m 999
2131 //
2132 // Case 4: c = c + C_lo finished.
2133 // Is i_1 even or odd?
2134 // if i_1 == 0, set PR_4, else set PR_5.
2135 //
2136 // r and c have been computed.
2137 // We known whether this is the sine or cosine routine.
2138 // Make sure ftz mode is set - should be automatic when using wre
2139 (p0) fmpy.s1 rsq = r, r
2140 nop.i 999 ;;
2141 }
2142
2143 { .mfi
2144 nop.m 999
2145 (p11) fadd.s1 c = c , C_lo
2146 (p11) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2147 }
2148
2149 { .mfi
2150 nop.m 999
2151 (p12) frcpa.s1 S_hi, p0 = f1, r
2152 nop.i 999
2153 }
2154
2155 { .mfi
2156 nop.m 999
2157 //
2158 // N odd: Change sign of S_hi
2159 //
2160 (p11) fma.s1 Result = rsq, P1_2, P1_1
2161 nop.i 999 ;;
2162 }
2163
2164 { .mfi
2165 nop.m 999
2166 (p12) fma.s1 P = rsq, Q1_2, Q1_1
2167 nop.i 999
2168 }
2169
2170 { .mfi
2171 nop.m 999
2172 //
2173 // N odd: Result = S_hi + S_lo (User supplied rounding mode for C1)
2174 //
2175 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2176 nop.i 999 ;;
2177 }
2178
2179 { .mfi
2180 nop.m 999
2181 //
2182 // N even: rsq = r * r
2183 // N odd: S_hi = frcpa(r)
2184 //
2185 (p12) fmerge.ns S_hi = S_hi, S_hi
2186 nop.i 999
2187 }
2188
2189 { .mfi
2190 nop.m 999
2191 //
2192 // N even: rsq = rsq * P1_2 + P1_1
2193 // N odd: poly1 = 1.0 + S_hi * r 16 bits partial account for necessary
2194 //
2195 (p11) fmpy.s1 Result = rsq, Result
2196 nop.i 999 ;;
2197 }
2198
2199 { .mfi
2200 nop.m 999
2201 (p12) fma.s1 poly1 = S_hi, r,f1
2202 nop.i 999
2203 }
2204
2205 { .mfi
2206 nop.m 999
2207 //
2208 // N even: Result = Result * rsq
2209 // N odd: S_hi = S_hi + S_hi*poly1 16 bits account for necessary
2210 //
2211 (p11) fma.s1 Result = r, Result, c
2212 nop.i 999 ;;
2213 }
2214
2215 { .mfi
2216 nop.m 999
2217 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2218 nop.i 999
2219 }
2220
2221 { .mfi
2222 nop.m 999
2223 //
2224 // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
2225 //
2226 (p11) fadd.s0 Result= r, Result
2227 nop.i 999 ;;
2228 }
2229
2230 { .mfi
2231 nop.m 999
2232 (p12) fma.s1 poly1 = S_hi, r, f1
2233 nop.i 999 ;;
2234 }
2235
2236 { .mfi
2237 nop.m 999
2238 //
2239 // N even: Result = Result * r + c
2240 // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
2241 //
2242 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2243 nop.i 999 ;;
2244 }
2245
2246 { .mfi
2247 nop.m 999
2248 (p12) fma.s1 poly1 = S_hi, r, f1
2249 nop.i 999 ;;
2250 }
2251
2252 { .mfi
2253 nop.m 999
2254 //
2255 // N even: Result1 = Result + r (Rounding mode S0)
2256 // N odd: poly1 = S_hi * r + 1.0 64 bits partial
2257 //
2258 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2259 nop.i 999 ;;
2260 }
2261
2262 { .mfi
2263 nop.m 999
2264 //
2265 // N odd: poly1 = S_hi * poly + S_hi 64 bits
2266 //
2267 (p12) fma.s1 poly1 = S_hi, r, f1
2268 nop.i 999 ;;
2269 }
2270
2271 { .mfi
2272 nop.m 999
2273 //
2274 // N odd: poly1 = S_hi * r + 1.0
2275 //
2276 (p12) fma.s1 poly1 = S_hi, c, poly1
2277 nop.i 999 ;;
2278 }
2279
2280 { .mfi
2281 nop.m 999
2282 //
2283 // N odd: poly1 = S_hi * c + poly1
2284 //
2285 (p12) fmpy.s1 S_lo = S_hi, poly1
2286 nop.i 999 ;;
2287 }
2288
2289 { .mfi
2290 nop.m 999
2291 //
2292 // N odd: S_lo = S_hi * poly1
2293 //
2294 (p12) fma.s1 S_lo = P, r, S_lo
2295 nop.i 999 ;;
2296 }
2297
2298 { .mfb
2299 nop.m 999
2300 //
2301 // N odd: S_lo = S_lo + r * P
2302 //
2303 (p12) fadd.s0 Result = S_hi, S_lo
2304 (p0) br.ret.sptk b0 ;;
2305 }
2306
2307
2308 TAN_SMALL_R:
2309
2310 { .mii
2311 nop.m 999
2312 (p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
2313 (p0) cmp.eq.unc p11, p12 = 0x0000, i_1
2314 }
2315
2316 { .mfi
2317 nop.m 999
2318 (p0) fmpy.s1 rsq = r, r
2319 nop.i 999 ;;
2320 }
2321
2322 { .mfi
2323 nop.m 999
2324 (p12) frcpa.s1 S_hi, p0 = f1, r
2325 nop.i 999
2326 }
2327
2328 { .mfi
2329 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
2330 nop.f 999
2331 nop.i 999
2332 }
2333 ;;
2334
2335 { .mmi
2336 (p0) ld8 table_ptr1 = [table_ptr1]
2337 nop.m 999
2338 nop.i 999
2339 }
2340 ;;
2341
2342 // *****************************************************************
2343 // *****************************************************************
2344 // *****************************************************************
2345
2346 { .mmi
2347 (p0) add table_ptr1 = 224, table_ptr1 ;;
2348 (p0) ldfe P1_1 = [table_ptr1], 16
2349 nop.i 999 ;;
2350 }
2351 // r and c have been computed.
2352 // We known whether this is the sine or cosine routine.
2353 // Make sure ftz mode is set - should be automatic when using wre
2354 // |r| < 2**(-2)
2355
2356 { .mfi
2357 (p0) ldfe P1_2 = [table_ptr1], 16
2358 (p11) fmpy.s1 r_to_the_8 = rsq, rsq
2359 nop.i 999 ;;
2360 }
2361 //
2362 // Set table_ptr1 to beginning of constant table.
2363 // Get [i_1] - lsb of N_fix_gr.
2364 //
2365
2366 { .mfi
2367 (p0) ldfe P1_3 = [table_ptr1], 96
2368 //
2369 // N even: rsq = r * r
2370 // N odd: S_hi = frcpa(r)
2371 //
2372 (p12) fmerge.ns S_hi = S_hi, S_hi
2373 nop.i 999 ;;
2374 }
2375 //
2376 // Is i_1 even or odd?
2377 // if i_1 == 0, set PR_11.
2378 // if i_1 != 0, set PR_12.
2379 //
2380
2381 { .mfi
2382 (p11) ldfe P1_9 = [table_ptr1], -16
2383 //
2384 // N even: Poly2 = P1_7 + Poly2 * rsq
2385 // N odd: poly2 = Q1_5 + poly2 * rsq
2386 //
2387 (p11) fadd.s1 CORR = rsq, f1
2388 nop.i 999 ;;
2389 }
2390
2391 { .mmi
2392 (p11) ldfe P1_8 = [table_ptr1], -16 ;;
2393 //
2394 // N even: Poly1 = P1_2 + P1_3 * rsq
2395 // N odd: poly1 = 1.0 + S_hi * r
2396 // 16 bits partial account for necessary (-1)
2397 //
2398 (p11) ldfe P1_7 = [table_ptr1], -16
2399 nop.i 999 ;;
2400 }
2401 //
2402 // N even: Poly1 = P1_1 + Poly1 * rsq
2403 // N odd: S_hi = S_hi + S_hi * poly1) 16 bits account for necessary
2404 //
2405
2406 { .mfi
2407 (p11) ldfe P1_6 = [table_ptr1], -16
2408 //
2409 // N even: Poly2 = P1_5 + Poly2 * rsq
2410 // N odd: poly2 = Q1_3 + poly2 * rsq
2411 //
2412 (p11) fmpy.s1 r_to_the_8 = r_to_the_8, r_to_the_8
2413 nop.i 999 ;;
2414 }
2415 //
2416 // N even: Poly1 = Poly1 * rsq
2417 // N odd: poly1 = 1.0 + S_hi * r 32 bits partial
2418 //
2419
2420 { .mfi
2421 (p11) ldfe P1_5 = [table_ptr1], -16
2422 (p12) fma.s1 poly1 = S_hi, r, f1
2423 nop.i 999 ;;
2424 }
2425 //
2426 // N even: CORR = CORR * c
2427 // N odd: S_hi = S_hi * poly1 + S_hi 32 bits
2428 //
2429
2430 //
2431 // N even: Poly2 = P1_6 + Poly2 * rsq
2432 // N odd: poly2 = Q1_4 + poly2 * rsq
2433 //
2434 { .mmf
2435 (p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp
2436 (p11) ldfe P1_4 = [table_ptr1], -16
2437 (p11) fmpy.s1 CORR = CORR, c
2438 }
2439 ;;
2440
2441
2442 { .mmi
2443 (p0) ld8 table_ptr2 = [table_ptr2]
2444 nop.m 999
2445 nop.i 999
2446 }
2447 ;;
2448
2449
2450 { .mii
2451 (p0) add table_ptr2 = 464, table_ptr2
2452 nop.i 999 ;;
2453 nop.i 999
2454 }
2455
2456 { .mfi
2457 nop.m 999
2458 (p11) fma.s1 Poly1 = P1_3, rsq, P1_2
2459 nop.i 999 ;;
2460 }
2461
2462 { .mfi
2463 (p0) ldfe Q1_7 = [table_ptr2], -16
2464 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2465 nop.i 999 ;;
2466 }
2467
2468 { .mfi
2469 (p0) ldfe Q1_6 = [table_ptr2], -16
2470 (p11) fma.s1 Poly2 = P1_9, rsq, P1_8
2471 nop.i 999 ;;
2472 }
2473
2474 { .mmi
2475 (p0) ldfe Q1_5 = [table_ptr2], -16 ;;
2476 (p12) ldfe Q1_4 = [table_ptr2], -16
2477 nop.i 999 ;;
2478 }
2479
2480 { .mfi
2481 (p12) ldfe Q1_3 = [table_ptr2], -16
2482 //
2483 // N even: Poly2 = P1_8 + P1_9 * rsq
2484 // N odd: poly2 = Q1_6 + Q1_7 * rsq
2485 //
2486 (p11) fma.s1 Poly1 = Poly1, rsq, P1_1
2487 nop.i 999 ;;
2488 }
2489
2490 { .mfi
2491 (p12) ldfe Q1_2 = [table_ptr2], -16
2492 (p12) fma.s1 poly1 = S_hi, r, f1
2493 nop.i 999 ;;
2494 }
2495
2496 { .mfi
2497 (p12) ldfe Q1_1 = [table_ptr2], -16
2498 (p11) fma.s1 Poly2 = Poly2, rsq, P1_7
2499 nop.i 999 ;;
2500 }
2501
2502 { .mfi
2503 nop.m 999
2504 //
2505 // N even: CORR = rsq + 1
2506 // N even: r_to_the_8 = rsq * rsq
2507 //
2508 (p11) fmpy.s1 Poly1 = Poly1, rsq
2509 nop.i 999 ;;
2510 }
2511
2512 { .mfi
2513 nop.m 999
2514 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2515 nop.i 999
2516 }
2517
2518 { .mfi
2519 nop.m 999
2520 (p12) fma.s1 poly2 = Q1_7, rsq, Q1_6
2521 nop.i 999 ;;
2522 }
2523
2524 { .mfi
2525 nop.m 999
2526 (p11) fma.s1 Poly2 = Poly2, rsq, P1_6
2527 nop.i 999 ;;
2528 }
2529
2530 { .mfi
2531 nop.m 999
2532 (p12) fma.s1 poly1 = S_hi, r, f1
2533 nop.i 999
2534 }
2535
2536 { .mfi
2537 nop.m 999
2538 (p12) fma.s1 poly2 = poly2, rsq, Q1_5
2539 nop.i 999 ;;
2540 }
2541
2542 { .mfi
2543 nop.m 999
2544 (p11) fma.s1 Poly2= Poly2, rsq, P1_5
2545 nop.i 999 ;;
2546 }
2547
2548 { .mfi
2549 nop.m 999
2550 (p12) fma.s1 S_hi = S_hi, poly1, S_hi
2551 nop.i 999
2552 }
2553
2554 { .mfi
2555 nop.m 999
2556 (p12) fma.s1 poly2 = poly2, rsq, Q1_4
2557 nop.i 999 ;;
2558 }
2559
2560 { .mfi
2561 nop.m 999
2562 //
2563 // N even: r_to_the_8 = r_to_the_8 * r_to_the_8
2564 // N odd: poly1 = S_hi * r + 1.0 64 bits partial
2565 //
2566 (p11) fma.s1 Poly2 = Poly2, rsq, P1_4
2567 nop.i 999 ;;
2568 }
2569
2570 { .mfi
2571 nop.m 999
2572 //
2573 // N even: Result = CORR + Poly * r
2574 // N odd: P = Q1_1 + poly2 * rsq
2575 //
2576 (p12) fma.s1 poly1 = S_hi, r, f1
2577 nop.i 999
2578 }
2579
2580 { .mfi
2581 nop.m 999
2582 (p12) fma.s1 poly2 = poly2, rsq, Q1_3
2583 nop.i 999 ;;
2584 }
2585
2586 { .mfi
2587 nop.m 999
2588 //
2589 // N even: Poly2 = P1_4 + Poly2 * rsq
2590 // N odd: poly2 = Q1_2 + poly2 * rsq
2591 //
2592 (p11) fma.s1 Poly = Poly2, r_to_the_8, Poly1
2593 nop.i 999 ;;
2594 }
2595
2596 { .mfi
2597 nop.m 999
2598 (p12) fma.s1 poly1 = S_hi, c, poly1
2599 nop.i 999
2600 }
2601
2602 { .mfi
2603 nop.m 999
2604 (p12) fma.s1 poly2 = poly2, rsq, Q1_2
2605 nop.i 999 ;;
2606 }
2607
2608 { .mfi
2609 nop.m 999
2610 //
2611 // N even: Poly = Poly1 + Poly2 * r_to_the_8
2612 // N odd: S_hi = S_hi * poly1 + S_hi 64 bits
2613 //
2614 (p11) fma.s1 Result = Poly, r, CORR
2615 nop.i 999 ;;
2616 }
2617
2618 { .mfi
2619 nop.m 999
2620 //
2621 // N even: Result = r + Result (User supplied rounding mode)
2622 // N odd: poly1 = S_hi * c + poly1
2623 //
2624 (p12) fmpy.s1 S_lo = S_hi, poly1
2625 nop.i 999
2626 }
2627
2628 { .mfi
2629 nop.m 999
2630 (p12) fma.s1 P = poly2, rsq, Q1_1
2631 nop.i 999 ;;
2632 }
2633
2634 { .mfi
2635 nop.m 999
2636 //
2637 // N odd: poly1 = S_hi * r + 1.0
2638 //
2639 (p11) fadd.s0 Result = Result, r
2640 nop.i 999 ;;
2641 }
2642
2643 { .mfi
2644 nop.m 999
2645 //
2646 // N odd: S_lo = S_hi * poly1
2647 //
2648 (p12) fma.s1 S_lo = Q1_1, c, S_lo
2649 nop.i 999
2650 }
2651
2652 { .mfi
2653 nop.m 999
2654 //
2655 // N odd: Result = Result + S_hi (user supplied rounding mode)
2656 //
2657 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2658 nop.i 999 ;;
2659 }
2660
2661 { .mfi
2662 nop.m 999
2663 //
2664 // N odd: S_lo = Q1_1 * c + S_lo
2665 //
2666 (p12) fma.s1 Result = P, r, S_lo
2667 nop.i 999 ;;
2668 }
2669
2670 { .mfb
2671 nop.m 999
2672 //
2673 // N odd: Result = S_lo + r * P
2674 //
2675 (p12) fadd.s0 Result = Result, S_hi
2676 (p0) br.ret.sptk b0 ;;
2677 }
2678
2679
2680 TAN_NORMAL_R:
2681
2682 { .mfi
2683 (p0) getf.sig sig_r = r
2684 // *******************************************************************
2685 // *******************************************************************
2686 // *******************************************************************
2687 //
2688 // r and c have been computed.
2689 // Make sure ftz mode is set - should be automatic when using wre
2690 //
2691 //
2692 // Get [i_1] - lsb of N_fix_gr alone.
2693 //
2694 (p0) fmerge.s Pos_r = f1, r
2695 (p0) extr.u i_1 = N_fix_gr, 0, 1 ;;
2696 }
2697
2698 { .mfi
2699 nop.m 999
2700 (p0) fmerge.s sgn_r = r, f1
2701 (p0) cmp.eq.unc p11, p12 = 0x0000, i_1 ;;
2702 }
2703
2704 { .mfi
2705 nop.m 999
2706 nop.f 999
2707 (p0) extr.u lookup = sig_r, 58, 5
2708 }
2709
2710 { .mlx
2711 nop.m 999
2712 (p0) movl Create_B = 0x8200000000000000 ;;
2713 }
2714
2715 { .mfi
2716 (p0) addl table_ptr1 = @ltoff(TAN_BASE_CONSTANTS), gp
2717 nop.f 999
2718 (p0) dep Create_B = lookup, Create_B, 58, 5
2719 }
2720 ;;
2721
2722 //
2723 // Get [i_1] - lsb of N_fix_gr alone.
2724 // Pos_r = abs (r)
2725 //
2726
2727
2728 { .mmi
2729 ld8 table_ptr1 = [table_ptr1]
2730 nop.m 999
2731 nop.i 999
2732 }
2733 ;;
2734
2735
2736 { .mmi
2737 nop.m 999
2738 (p0) setf.sig B = Create_B
2739 //
2740 // Set table_ptr1 and table_ptr2 to base address of
2741 // constant table.
2742 //
2743 (p0) add table_ptr1 = 480, table_ptr1 ;;
2744 }
2745
2746 { .mmb
2747 nop.m 999
2748 //
2749 // Is i_1 or i_0 == 0 ?
2750 // Create the constant 1 00000 1000000000000000000000...
2751 //
2752 (p0) ldfe P2_1 = [table_ptr1], 16
2753 nop.b 999
2754 }
2755
2756 { .mmi
2757 nop.m 999 ;;
2758 (p0) getf.exp exp_r = Pos_r
2759 nop.i 999
2760 }
2761 //
2762 // Get r's exponent
2763 // Get r's significand
2764 //
2765
2766 { .mmi
2767 (p0) ldfe P2_2 = [table_ptr1], 16 ;;
2768 //
2769 // Get the 5 bits or r for the lookup. 1.xxxxx ....
2770 // from sig_r.
2771 // Grab lsb of exp of B
2772 //
2773 (p0) ldfe P2_3 = [table_ptr1], 16
2774 nop.i 999 ;;
2775 }
2776
2777 { .mii
2778 nop.m 999
2779 (p0) andcm table_offset = 0x0001, exp_r ;;
2780 (p0) shl table_offset = table_offset, 9 ;;
2781 }
2782
2783 { .mii
2784 nop.m 999
2785 //
2786 // Deposit 0 00000 1000000000000000000000... on
2787 // 1 xxxxx yyyyyyyyyyyyyyyyyyyyyy...,
2788 // getting rid of the ys.
2789 // Is B = 2** -2 or B= 2** -1? If 2**-1, then
2790 // we want an offset of 512 for table addressing.
2791 //
2792 (p0) shladd table_offset = lookup, 4, table_offset ;;
2793 //
2794 // B = ........ 1xxxxx 1000000000000000000...
2795 //
2796 (p0) add table_ptr1 = table_ptr1, table_offset ;;
2797 }
2798
2799 { .mmb
2800 nop.m 999
2801 //
2802 // B = ........ 1xxxxx 1000000000000000000...
2803 // Convert B so it has the same exponent as Pos_r
2804 //
2805 (p0) ldfd T_hi = [table_ptr1], 8
2806 nop.b 999 ;;
2807 }
2808
2809 //
2810 // x = |r| - B
2811 // Load T_hi.
2812 // Load C_hi.
2813 //
2814
2815 { .mmf
2816 (p0) addl table_ptr2 = @ltoff(TAN_BASE_CONSTANTS), gp
2817 (p0) ldfs T_lo = [table_ptr1]
2818 (p0) fmerge.se B = Pos_r, B
2819 }
2820 ;;
2821
2822 { .mmi
2823 ld8 table_ptr2 = [table_ptr2]
2824 nop.m 999
2825 nop.i 999
2826 }
2827 ;;
2828
2829 { .mii
2830 (p0) add table_ptr2 = 1360, table_ptr2
2831 nop.i 999 ;;
2832 (p0) add table_ptr2 = table_ptr2, table_offset ;;
2833 }
2834
2835 { .mfi
2836 (p0) ldfd C_hi = [table_ptr2], 8
2837 (p0) fsub.s1 x = Pos_r, B
2838 nop.i 999 ;;
2839 }
2840
2841 { .mii
2842 (p0) ldfs C_lo = [table_ptr2],255
2843 nop.i 999 ;;
2844 //
2845 // xsq = x * x
2846 // N even: Tx = T_hi * x
2847 // Load T_lo.
2848 // Load C_lo - increment pointer to get SC_inv
2849 // - cant get all the way, do an add later.
2850 //
2851 (p0) add table_ptr2 = 569, table_ptr2 ;;
2852 }
2853 //
2854 // N even: Tx1 = Tx + 1
2855 // N odd: Cx1 = 1 - Cx
2856 //
2857
2858 { .mfi
2859 (p0) ldfe SC_inv = [table_ptr2], 0
2860 nop.f 999
2861 nop.i 999 ;;
2862 }
2863
2864 { .mfi
2865 nop.m 999
2866 (p0) fmpy.s1 xsq = x, x
2867 nop.i 999
2868 }
2869
2870 { .mfi
2871 nop.m 999
2872 (p11) fmpy.s1 Tx = T_hi, x
2873 nop.i 999 ;;
2874 }
2875
2876 { .mfi
2877 nop.m 999
2878 (p12) fmpy.s1 Cx = C_hi, x
2879 nop.i 999 ;;
2880 }
2881
2882 { .mfi
2883 nop.m 999
2884 //
2885 // N odd: Cx = C_hi * x
2886 //
2887 (p0) fma.s1 P = P2_3, xsq, P2_2
2888 nop.i 999
2889 }
2890
2891 { .mfi
2892 nop.m 999
2893 //
2894 // N even and odd: P = P2_3 + P2_2 * xsq
2895 //
2896 (p11) fadd.s1 Tx1 = Tx, f1
2897 nop.i 999 ;;
2898 }
2899
2900 { .mfi
2901 nop.m 999
2902 //
2903 // N even: D = C_hi - tanx
2904 // N odd: D = T_hi + tanx
2905 //
2906 (p11) fmpy.s1 CORR = SC_inv, T_hi
2907 nop.i 999
2908 }
2909
2910 { .mfi
2911 nop.m 999
2912 (p0) fmpy.s1 Sx = SC_inv, x
2913 nop.i 999 ;;
2914 }
2915
2916 { .mfi
2917 nop.m 999
2918 (p12) fmpy.s1 CORR = SC_inv, C_hi
2919 nop.i 999 ;;
2920 }
2921
2922 { .mfi
2923 nop.m 999
2924 (p12) fsub.s1 V_hi = f1, Cx
2925 nop.i 999 ;;
2926 }
2927
2928 { .mfi
2929 nop.m 999
2930 (p0) fma.s1 P = P, xsq, P2_1
2931 nop.i 999
2932 }
2933
2934 { .mfi
2935 nop.m 999
2936 //
2937 // N even and odd: P = P2_1 + P * xsq
2938 //
2939 (p11) fma.s1 V_hi = Tx, Tx1, f1
2940 nop.i 999 ;;
2941 }
2942
2943 { .mfi
2944 nop.m 999
2945 //
2946 // N even: Result = sgn_r * tail + T_hi (user rounding mode for C1)
2947 // N odd: Result = sgn_r * tail + C_hi (user rounding mode for C1)
2948 //
2949 (p0) fmpy.s0 fp_tmp = fp_tmp, fp_tmp // Dummy mult to set inexact
2950 nop.i 999 ;;
2951 }
2952
2953 { .mfi
2954 nop.m 999
2955 (p0) fmpy.s1 CORR = CORR, c
2956 nop.i 999 ;;
2957 }
2958
2959 { .mfi
2960 nop.m 999
2961 (p12) fnma.s1 V_hi = Cx,V_hi,f1
2962 nop.i 999 ;;
2963 }
2964
2965 { .mfi
2966 nop.m 999
2967 //
2968 // N even: V_hi = Tx * Tx1 + 1
2969 // N odd: Cx1 = 1 - Cx * Cx1
2970 //
2971 (p0) fmpy.s1 P = P, xsq
2972 nop.i 999
2973 }
2974
2975 { .mfi
2976 nop.m 999
2977 //
2978 // N even and odd: P = P * xsq
2979 //
2980 (p11) fmpy.s1 V_hi = V_hi, T_hi
2981 nop.i 999 ;;
2982 }
2983
2984 { .mfi
2985 nop.m 999
2986 //
2987 // N even and odd: tail = P * tail + V_lo
2988 //
2989 (p11) fmpy.s1 T_hi = sgn_r, T_hi
2990 nop.i 999 ;;
2991 }
2992
2993 { .mfi
2994 nop.m 999
2995 (p0) fmpy.s1 CORR = CORR, sgn_r
2996 nop.i 999 ;;
2997 }
2998
2999 { .mfi
3000 nop.m 999
3001 (p12) fmpy.s1 V_hi = V_hi,C_hi
3002 nop.i 999 ;;
3003 }
3004
3005 { .mfi
3006 nop.m 999
3007 //
3008 // N even: V_hi = T_hi * V_hi
3009 // N odd: V_hi = C_hi * V_hi
3010 //
3011 (p0) fma.s1 tanx = P, x, x
3012 nop.i 999
3013 }
3014
3015 { .mfi
3016 nop.m 999
3017 (p12) fnmpy.s1 C_hi = sgn_r, C_hi
3018 nop.i 999 ;;
3019 }
3020
3021 { .mfi
3022 nop.m 999
3023 //
3024 // N even: V_lo = 1 - V_hi + C_hi
3025 // N odd: V_lo = 1 - V_hi + T_hi
3026 //
3027 (p11) fadd.s1 CORR = CORR, T_lo
3028 nop.i 999
3029 }
3030
3031 { .mfi
3032 nop.m 999
3033 (p12) fsub.s1 CORR = CORR, C_lo
3034 nop.i 999 ;;
3035 }
3036
3037 { .mfi
3038 nop.m 999
3039 //
3040 // N even and odd: tanx = x + x * P
3041 // N even and odd: Sx = SC_inv * x
3042 //
3043 (p11) fsub.s1 D = C_hi, tanx
3044 nop.i 999
3045 }
3046
3047 { .mfi
3048 nop.m 999
3049 (p12) fadd.s1 D = T_hi, tanx
3050 nop.i 999 ;;
3051 }
3052
3053 { .mfi
3054 nop.m 999
3055 //
3056 // N odd: CORR = SC_inv * C_hi
3057 // N even: CORR = SC_inv * T_hi
3058 //
3059 (p0) fnma.s1 D = V_hi, D, f1
3060 nop.i 999 ;;
3061 }
3062
3063 { .mfi
3064 nop.m 999
3065 //
3066 // N even and odd: D = 1 - V_hi * D
3067 // N even and odd: CORR = CORR * c
3068 //
3069 (p0) fma.s1 V_hi = V_hi, D, V_hi
3070 nop.i 999 ;;
3071 }
3072
3073 { .mfi
3074 nop.m 999
3075 //
3076 // N even and odd: V_hi = V_hi + V_hi * D
3077 // N even and odd: CORR = sgn_r * CORR
3078 //
3079 (p11) fnma.s1 V_lo = V_hi, C_hi, f1
3080 nop.i 999
3081 }
3082
3083 { .mfi
3084 nop.m 999
3085 (p12) fnma.s1 V_lo = V_hi, T_hi, f1
3086 nop.i 999 ;;
3087 }
3088
3089 { .mfi
3090 nop.m 999
3091 //
3092 // N even: CORR = COOR + T_lo
3093 // N odd: CORR = CORR - C_lo
3094 //
3095 (p11) fma.s1 V_lo = tanx, V_hi, V_lo
3096 nop.i 999
3097 }
3098
3099 { .mfi
3100 nop.m 999
3101 (p12) fnma.s1 V_lo = tanx, V_hi, V_lo
3102 nop.i 999 ;;
3103 }
3104
3105 { .mfi
3106 nop.m 999
3107 //
3108 // N even: V_lo = V_lo + V_hi * tanx
3109 // N odd: V_lo = V_lo - V_hi * tanx
3110 //
3111 (p11) fnma.s1 V_lo = C_lo, V_hi, V_lo
3112 nop.i 999
3113 }
3114
3115 { .mfi
3116 nop.m 999
3117 (p12) fnma.s1 V_lo = T_lo, V_hi, V_lo
3118 nop.i 999 ;;
3119 }
3120
3121 { .mfi
3122 nop.m 999
3123 //
3124 // N even: V_lo = V_lo - V_hi * C_lo
3125 // N odd: V_lo = V_lo - V_hi * T_lo
3126 //
3127 (p0) fmpy.s1 V_lo = V_hi, V_lo
3128 nop.i 999 ;;
3129 }
3130
3131 { .mfi
3132 nop.m 999
3133 //
3134 // N even and odd: V_lo = V_lo * V_hi
3135 //
3136 (p0) fadd.s1 tail = V_hi, V_lo
3137 nop.i 999 ;;
3138 }
3139
3140 { .mfi
3141 nop.m 999
3142 //
3143 // N even and odd: tail = V_hi + V_lo
3144 //
3145 (p0) fma.s1 tail = tail, P, V_lo
3146 nop.i 999 ;;
3147 }
3148
3149 { .mfi
3150 nop.m 999
3151 //
3152 // N even: T_hi = sgn_r * T_hi
3153 // N odd : C_hi = -sgn_r * C_hi
3154 //
3155 (p0) fma.s1 tail = tail, Sx, CORR
3156 nop.i 999 ;;
3157 }
3158
3159 { .mfi
3160 nop.m 999
3161 //
3162 // N even and odd: tail = Sx * tail + CORR
3163 //
3164 (p0) fma.s1 tail = V_hi, Sx, tail
3165 nop.i 999 ;;
3166 }
3167
3168 { .mfi
3169 nop.m 999
3170 //
3171 // N even an odd: tail = Sx * V_hi + tail
3172 //
3173 (p11) fma.s0 Result = sgn_r, tail, T_hi
3174 nop.i 999
3175 }
3176
3177 { .mfb
3178 nop.m 999
3179 (p12) fma.s0 Result = sgn_r, tail, C_hi
3180 (p0) br.ret.sptk b0 ;;
3181 }
3182
3183 .endp __libm_tan
3184 ASM_SIZE_DIRECTIVE(__libm_tan)
3185
3186
3187
3188 // *******************************************************************
3189 // *******************************************************************
3190 // *******************************************************************
3191 //
3192 // Special Code to handle very large argument case.
3193 // Call int pi_by_2_reduce(&x,&r)
3194 // for |arguments| >= 2**63
3195 // (Arg or x) is in f8
3196 // Address to save r and c as double
3197
3198 // (1) (2) (3) (call) (4)
3199 // sp -> + psp -> + psp -> + sp -> +
3200 // | | | |
3201 // | r50 ->| <- r50 f0 ->| r50 -> | -> c
3202 // | | | |
3203 // sp-32 -> | <- r50 f0 ->| f0 ->| <- r50 r49 -> | -> r
3204 // | | | |
3205 // | r49 ->| <- r49 Arg ->| <- r49 | -> x
3206 // | | | |
3207 // sp -64 ->| sp -64 ->| sp -64 ->| |
3208 //
3209 // save pfs save b0 restore gp
3210 // save gp restore b0
3211 // restore pfs
3212
3213
3214
3215 .proc __libm_callout
3216 __libm_callout:
3217 TAN_ARG_TOO_LARGE:
3218 .prologue
3219 // (1)
3220 { .mfi
3221 add GR_Parameter_r =-32,sp // Parameter: r address
3222 nop.f 0
3223 .save ar.pfs,GR_SAVE_PFS
3224 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
3225 }
3226 { .mfi
3227 .fframe 64
3228 add sp=-64,sp // Create new stack
3229 nop.f 0
3230 mov GR_SAVE_GP=gp // Save gp
3231 };;
3232
3233 // (2)
3234 { .mmi
3235 stfe [GR_Parameter_r ] = f0,16 // Clear Parameter r on stack
3236 add GR_Parameter_X = 16,sp // Parameter x address
3237 .save b0, GR_SAVE_B0
3238 mov GR_SAVE_B0=b0 // Save b0
3239 };;
3240
3241 // (3)
3242 .body
3243 { .mib
3244 stfe [GR_Parameter_r ] = f0,-16 // Clear Parameter c on stack
3245 nop.i 0
3246 nop.b 0
3247 }
3248 { .mib
3249 stfe [GR_Parameter_X] = Arg // Store Parameter x on stack
3250 nop.i 0
3251 (p0) br.call.sptk b0=__libm_pi_by_2_reduce#
3252 }
3253 ;;
3254
3255
3256 // (4)
3257 { .mmi
3258 mov gp = GR_SAVE_GP // Restore gp
3259 (p0) mov N_fix_gr = r8
3260 nop.i 999
3261 }
3262 ;;
3263
3264 { .mmi
3265 (p0) ldfe Arg =[GR_Parameter_X],16
3266 (p0) ldfs TWO_TO_NEG2 = [table_ptr2],4
3267 nop.i 999
3268 }
3269 ;;
3270
3271
3272 { .mmb
3273 (p0) ldfe r =[GR_Parameter_r ],16
3274 (p0) ldfs NEGTWO_TO_NEG2 = [table_ptr2],4
3275 nop.b 999 ;;
3276 }
3277
3278 { .mfi
3279 (p0) ldfe c =[GR_Parameter_r ]
3280 nop.f 999
3281 nop.i 999 ;;
3282 }
3283
3284 { .mfi
3285 nop.m 999
3286 //
3287 // Is |r| < 2**(-2)
3288 //
3289 (p0) fcmp.lt.unc.s1 p6, p0 = r, TWO_TO_NEG2
3290 mov b0 = GR_SAVE_B0 // Restore return address
3291 }
3292 ;;
3293
3294 { .mfi
3295 nop.m 999
3296 (p6) fcmp.gt.unc.s1 p6, p0 = r, NEGTWO_TO_NEG2
3297 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
3298 }
3299 ;;
3300
3301 { .mbb
3302 .restore sp
3303 add sp = 64,sp // Restore stack pointer
3304 (p6) br.cond.spnt TAN_SMALL_R
3305 (p0) br.cond.sptk TAN_NORMAL_R
3306 }
3307 ;;
3308 .endp __libm_callout
3309 ASM_SIZE_DIRECTIVE(__libm_callout)
3310
3311
3312 .proc __libm_TAN_SPECIAL
3313 __libm_TAN_SPECIAL:
3314
3315 //
3316 // Code for NaNs, Unsupporteds, Infs, or +/- zero ?
3317 // Invalid raised for Infs and SNaNs.
3318 //
3319
3320 { .mfb
3321 nop.m 999
3322 (p0) fmpy.s0 Arg = Arg, f0
3323 (p0) br.ret.sptk b0
3324 }
3325 .endp __libm_TAN_SPECIAL
3326 ASM_SIZE_DIRECTIVE(__libm_TAN_SPECIAL)
3327
3328
3329 .type __libm_pi_by_2_reduce#,@function
3330 .global __libm_pi_by_2_reduce#