initial commit
[glibc.git] / sysdeps / ia64 / fpu / s_tanhf.S
1 .file "tanhf.s"
2
3
4 // Copyright (c) 2001 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 //
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
18 //
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
22
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
38 //
39 // History
40 //==============================================================
41 // 05/30/01 Initial version
42 // 05/20/02 Cleaned up namespace and sf0 syntax
43 // 02/10/03 Reordered header: .section, .global, .proc, .align
44 // 03/31/05 Reformatted delimiters between data tables
45 //
46 // API
47 //==============================================================
48 // float tanhf(float)
49 //
50 // Overview of operation
51 //==============================================================
52 // Background
53 //
54 //
55 // There are 9 paths:
56 // 1. x = +/-0.0
57 // Return tanhf(x) = +/-0.0
58 //
59 // 2. 0.0 < |x| < 0.3125
60 // Return tanhf(x) = x + x^3*Pol3(x^2),
61 // where Pol3(x^2) = C3*x^6 + C2*x^4 + C1*x^2 + C0
62 //
63 // 3. 0.3125 <= |x| < 8.0
64 // Return tanhf(x) = sign(x)*PolD(x)*PolC(|x|) + sign(x)*PolA(|x|),
65 // where sign(x)*PolD(x) = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4),
66 // PolC(|x|) = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0,
67 // PolA(|x|) = A3|x|^3 + A2*x^2 + A1*|x| + A0
68 //
69 // The range 0.3125 <= |x| < 8.0 is actually split into 5 subranges.
70 // For each subrange there is a particular set of coefficients.
71 // Below is the list of subranges:
72 // 3.1 0.3125 <= |x| < 0.5
73 // 3.2 0.5 <= |x| < 1.0
74 // 3.3 1.0 <= |x| < 2.0
75 // 3.4 2.0 <= |x| < 4.0
76 // 3.5 4.0 <= |x| < 8.0
77 //
78 // 4. 8.0 <= |x| < 9.125
79 // Return tanhf(x) = sign(x)*(A3|x|^3 + A2*x^2 + A1*|x| + A0)
80 //
81 // 5. 9.125 <= |x| < +INF
82 // Return tanhf(x) = sign(x)*(1.0d - 2^(-53))
83 //
84 // 6. |x| = INF
85 // Return tanhf(x) = sign(x) * 1.0
86 //
87 // 7. x = [S,Q]NaN
88 // Return tanhf(x) = QNaN
89 //
90 // 8. x is positive denormal
91 // Return tanhf(x) = x - x^2
92 //
93 // 9. x is negative denormal
94 // Return tanhf(x) = x + x^2
95 //
96 // Registers used
97 //==============================================================
98 // Floating Point registers used:
99 // f8, input
100 // f32 -> f59
101
102 // General registers used:
103 // r32 -> r46, r2, r3
104
105 // Predicate registers used:
106 // p0, p6 -> p15
107
108 // p6 to filter out case when x = [Q,S]NaN or +/-0
109 // p7 to filter out case when x = denormal
110 // p8 set if |x| >= 0.3125, used also to process denormal input
111 // p9 to filter out case when |x| = inf
112 // p10 to filter out case when |x| < 0.3125
113 // p11 set if |x| < 9.125; gates the coefficient loads (also used when |x| < 0.3125)
114 // p12 to filter out case when |x| >= 9.125
115 // p13 to filter out case when 8.0 <= |x| < 9.125
116 // p14 set to 1 for positive x
117 // p15 set to 1 for negative x
118
119 // Assembly macros
120 //==============================================================
121 rDataPtr = r2 // address of tanhf_data; advanced by 544 on the |x| >= 9.125 path
122 rDataPtr1 = r3 // not referenced by any instruction in this file
123
124 rBias = r33 // coefficient-set index: rOffset2 - rBias2, or 0x14 when |x| < 0.3125
125 rCoeffAddr3 = r34 // address of C2,C3, then (after post-increment) of D0,D1
126 rNearSaturation = r35 // constant 0x14, compared with rBias to select the 8.0 <= |x| < 9.125 path
127 rCoeffAddr1 = r36 // address of C0,C1, then (+48) of D2,B0
128 rCoeffAddr2 = r37 // address of A0,A1, then (after post-increment) of A2,A3
129 rOffset2 = r38 // bits of x with rMask cleared, shifted right by 21
130 rBias2 = r39 // constant 0x1F4, subtracted from rOffset2
131 rMask = r40 // 0x806 << 20 = 0x80600000
132 rArg = r41 // single-precision bit pattern of x (getf.s)
133 rBound = r42 // 0x3EA << 20 = 0x3EA00000, bit pattern of 0.3125f
134 rSignBit = r43 // 1 << 31
135 rAbsArg = r44 // rArg with the sign bit cleared
136 rDataPtr2 = r45 // not referenced by any instruction in this file
137 rSaturation = r46 // 0x4112 << 16 = 0x41120000, bit pattern of 9.125f
138
139 //==============================================================
140 fA0 = f32 // PolA coefficient A0; holds the saturation constant on the |x| >= 9.125 path
141 fA1 = f33 // PolA coefficient A1
142 fA2 = f34 // PolA coefficient A2
143 fA3 = f35 // PolA coefficient A3
144 fC0 = f36 // PolC coefficient C0 (Pol3 coefficient when |x| < 0.3125)
145 fC1 = f37 // PolC coefficient C1 (Pol3 coefficient when |x| < 0.3125)
146 fC2 = f38 // PolC coefficient C2 (Pol3 coefficient when |x| < 0.3125)
147 fC3 = f39 // PolC coefficient C3 (Pol3 coefficient when |x| < 0.3125)
148 fD0 = f40 // PolD coefficient D0
149 fD1 = f41 // PolD coefficient D1
150 fD2 = f42 // PolD coefficient D2
151 fB0 = f43 // coefficient B0, later overwritten with B0*x^4
152 fArgSqr = f44 // x^2
153 fAbsArg = f45 // |x|
154 fSignumX = f46 // 1.0 with the sign of x
155 fArg4 = f47 // x^4
156 fArg4Sgn = f48 // sign(x)*x^4
157 fArg3 = f49 // |x|^3
158 fArg3Sgn = f50 // x^3 = sign(x)*|x|^3
159 fArg7Sgn = f51 // sign(x)*|x|^7
160 fArg6Sgn = f52 // sign(x)*x^6
161 fPolC = f53 // PolC accumulator
162 fPolCTmp = f54 // C1*|x| + C0 (C1*x^2 + C0 when |x| < 0.3125)
163 fPolA = f55 // PolA accumulator
164 fPolATmp = f56 // A3*|x| + A2 (A1*|x| + A0 on the 8.0 <= |x| < 9.125 path)
165 fPolD = f57 // PolD accumulator
166 fPolDTmp = f58 // sign(x)*(|x|^7 + D2*x^6)
167 fArgSqrSgn = f59 // sign(x)*x^2
168
169 // Data tables
170 //==============================================================
171
172 RODATA
173
174 .align 16
175
176 LOCAL_OBJECT_START(tanhf_data)
177 // Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5 (byte offset 0)
178 data8 0x3F9BEEDFDD177D7B // C0
179 data8 0x3F970D10C7F32458 // C1
180 data8 0x3F766D6B051F3A38 // C2
181 data8 0xBF732F2001B23402 // C3
182 data8 0xBF854BE1CE1ED499 // D0
183 data8 0x4013C944F3999A16 // D1
184 data8 0xC01106C6975222C0 // D2
185 data8 0x3F783D5ACCF9EBE8 // B0
186 // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0 (byte offset 64)
187 data8 0xBF5D631440786869 // C0
188 data8 0xBF575D79A0D52069 // C1
189 data8 0xBF7E2237B7EFC705 // C2
190 data8 0x3F6A7ACBC273041F // C3
191 data8 0xC040E32EA52D91EB // D0
192 data8 0x403D19463E5DB4D7 // D1
193 data8 0xC02216F61F759F39 // D2
194 data8 0xBF55B4EA0B844BE7 // B0
195 // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0 (byte offset 128)
196 data8 0x3F8637DBE5B3E690 // C0
197 data8 0xBF7F7FEC158C07F5 // C1
198 data8 0x3F711C586706838A // C2
199 data8 0xBF50EF7EF605554E // C3
200 data8 0xC054D45448354E25 // D0
201 data8 0x404ADFEEA282E730 // D1
202 data8 0xC028AEE456D59549 // D2
203 data8 0x3F25232D1BED59A8 // B0
204 // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0 (byte offset 192)
205 data8 0xBF52602285F2D06C // C0
206 data8 0x3F2E57C298FFE1E0 // C1
207 data8 0xBF15ED575DB3C811 // C2
208 data8 0x3EE428878A08525C // C3
209 data8 0xC0895A26849039C1 // D0
210 data8 0x406E3C60BBFBB575 // D1
211 data8 0xC03A06F62867C75A // D2
212 data8 0xBEB114C70F1C723E // B0
213 // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0 (byte offset 256)
214 data8 0x3EF4B22BD17039A3 // C0
215 data8 0xBEB704ADC040C57F // C1
216 data8 0x3E937A98288AFE1A // C2
217 data8 0xBE4F33B2C9FFE7E7 // C3
218 data8 0xC0BE48CFADE2431E // D0
219 data8 0x4090E74249760FDD // D1
220 data8 0xC04B6F537FCF2F1E // D2
221 data8 0x3E0DCD879C91ADEA // B0
222 // Polynomial coefficients for the tanh(x), -0.3125 < x < 0.3125 (byte offset 320; loaded into fC0..fC3)
223 data8 0xBFD555551E8245B7 // A0
224 data8 0x3FC110E63F52E689 // A1
225 data8 0xBFAB8CD6A5B7BAFA // A2
226 data8 0x3F945D467FCEB553 // A3
227 // Polynomial coefficients for the tanh(x), 0.3125 <= |x| < 0.5 (byte offset 352)
228 data8 0xBE3DCC92FCAECBB6 // A0
229 data8 0x3FF0000043B7D267 // A1
230 data8 0xBED18BF28ACFC4B1 // A2
231 data8 0xBFD554A56F82837E // A3
232 // Polynomial coefficients for the tanh(x), 0.5 <= |x| < 1.0 (byte offset 384)
233 data8 0x3EFD6054758539F9 // A0
234 data8 0x3FEFFBFC77198EBE // A1
235 data8 0x3F700327CA98D237 // A2
236 data8 0xBFD68955F5BB2FA1 // A3
237 // Polynomial coefficients for the tanh(x), 1.0 <= |x| < 2.0 (byte offset 416)
238 data8 0xBF71A53F229DF01B // A0
239 data8 0x3FF0AECFD730DE50 // A1
240 data8 0xBFC882F88E5DF3BA // A2
241 data8 0x3FC6EDF212CA2A8D // A3
242 // Polynomial coefficients for the tanh(x), 2.0 <= |x| < 4.0 (byte offset 448)
243 data8 0xBFAF0B712E9EDA47 // A0
244 data8 0x3FF1C208080BEA64 // A1
245 data8 0x3FC3D29B20C8946E // A2
246 data8 0xBFF04514ED900A6A // A3
247 // Polynomial coefficients for the tanh(x), 4.0 <= |x| < 8.0 (byte offset 480)
248 data8 0xBFB1DEA49A831CBC // A0
249 data8 0x3FFA729FC7085674 // A1
250 data8 0xBFF2F44D923A8FA4 // A2
251 data8 0x3FE092FC5712227E // A3
252 // Polynomial coefficients for the tanh(x), 8.0 <= |x| < 9.125 (byte offset 512)
253 data8 0x3FEFFF5769EE3041 // A0
254 data8 0x3EFBBF148D850891 // A1
255 data8 0xBEC86BCEF0F5C2FE // A2
256 data8 0x3E7CBA4F3A885A5C // A3
257 // Saturation constant for |x| >= 9.125 (byte offset 544)
258 data8 0x3FEFFFFFFFFFFFFF // 1.0 - 2^(-53), the largest double below 1.0
259 LOCAL_OBJECT_END(tanhf_data)
260
261 .section .text
262 GLOBAL_LIBM_ENTRY(tanhf)
263 // In: f8 = x.  Out: f8 = tanhf(x).  Every exit is a br.ret through b0.
264 { .mfi
265 alloc r32 = ar.pfs, 1, 14, 0, 0 // 1 input + 14 locals: r32 .. r46
266 fmerge.s fAbsArg = f1, f8 // |x|
267 addl rMask = 0x806, r0 // shifted left by 20 below
268 }
269 { .mfi
270 addl rDataPtr = @ltoff(tanhf_data), gp // linkage-table slot holding &tanhf_data
271 fma.s1 fArgSqr = f8, f8, f0 // x^2
272 adds rSignBit = 0x1, r0 // shifted left by 31 below
273 }
274 ;;
275
276 { .mfi
277 getf.s rArg = f8 // single-precision bit pattern of x in GR
278 fclass.m p7,p0 = f8, 0x0b // is x denormal ?
279 // rMask = 0x80600000: sign bit and the 2 most significant bits of the significand
280 shl rMask = rMask, 20
281 }
282 { .mfi
283 ld8 rDataPtr = [rDataPtr] // rDataPtr = &tanhf_data
284 nop.f 0
285 adds rBias2 = 0x1F4, r0 // subtracted from rOffset2 to form the table index
286 }
287 ;;
288
289 { .mfi
290 adds rNearSaturation = 0x14, r0 // rBias value tested for the 8.0 <= |x| < 9.125 path
291 fmerge.s fSignumX = f8, f1 // signum(x)
292 shl rSignBit = rSignBit, 31 // mask for sign bit
293 }
294 { .mfi
295 adds rBound = 0x3EA, r0 // shifted left by 20 below
296 nop.f 0
297 addl rSaturation = 0x4112, r0 // shifted left by 16 below
298 }
299 ;;
300
301 { .mfi
302 andcm rOffset2 = rArg, rMask // clear sign and top 2 significand bits
303 fclass.m p6,p0 = f8, 0xc7 // is x [S,Q]NaN or +/-0 ?
304 shl rBound = rBound, 20 // 0x3EA00000 = 0.3125f in GR
305 }
306 { .mfb
307 andcm rAbsArg = rArg, rSignBit // |x| in GR
308 nop.f 0
309 (p7) br.cond.spnt tanhf_denormal // branch out if x is denormal
310 }
311 ;;
312
313 { .mfi
314 adds rCoeffAddr2 = 352, rDataPtr // byte offset 352: A0..A3 table for 0.3125 <= |x| < 0.5
315 fclass.m p9,p0 = f8, 0x23 // is x +/- inf?
316 shr rOffset2 = rOffset2, 21 // exponent field of x, times 4
317 }
318 { .mfi
319 cmp.lt p10, p8 = rAbsArg, rBound // |x| < 0.3125?
320 nop.f 0
321 adds rCoeffAddr3 = 16, rDataPtr // C2,C3 sit 16 bytes into each table
322 }
323 ;;
324
325 { .mfi
326 (p8) sub rBias = rOffset2, rBias2 // 0, 4, 8, ...: steps by 4 per power-of-two range of |x|
327 fma.s1 fArg4 = fArgSqr, fArgSqr, f0 // x^4
328 shl rSaturation = rSaturation, 16 // 0x41120000 = 9.125f in GR
329 }
330 { .mfb
331 (p10) adds rBias = 0x14, r0 // |x| < 0.3125: use the table at byte offset 0x14*16 = 320
332 (p6) fma.s.s0 f8 = f8,f1,f8 // NaN or +/-0
333 (p6) br.ret.spnt b0 // exit for x = NaN or +/-0
334 }
335 ;;
336
337 { .mfi
338 shladd rCoeffAddr1 = rBias, 4, rDataPtr // rDataPtr + 16*rBias -> C0,C1
339 fma.s1 fArg3Sgn = fArgSqr, f8, f0 // sign(x)*|x|^3
340 // is |x| < 9.125?
341 cmp.lt p11, p12 = rAbsArg, rSaturation
342 }
343 { .mfi
344 shladd rCoeffAddr3 = rBias, 4, rCoeffAddr3 // rDataPtr + 16 + 16*rBias -> C2,C3
345 fma.s1 fArg3 = fArgSqr, fAbsArg, f0 // |x|^3
346 shladd rCoeffAddr2 = rBias, 3, rCoeffAddr2 // rDataPtr + 352 + 8*rBias -> A0,A1
347 }
348 ;;
349
350 { .mfi
351 (p11) ldfpd fC0, fC1 = [rCoeffAddr1]
352 (p9) fmerge.s f8 = f8,f1 // +/- inf: sign of x on magnitude 1.0
353 (p12) adds rDataPtr = 544, rDataPtr // byte offset 544: the 1.0 - 2^(-53) constant
354 }
355 { .mfb
356 (p11) ldfpd fC2, fC3 = [rCoeffAddr3], 16 // post-increment: rCoeffAddr3 -> D0,D1
357 nop.f 0
358 (p9) br.ret.spnt b0 // exit for x = +/- inf
359 }
360 ;;
361
362 { .mfi
363 (p11) ldfpd fA0, fA1 = [rCoeffAddr2], 16 // post-increment: rCoeffAddr2 -> A2,A3
364 nop.f 0
365 (p8) cmp.eq.unc p13, p0 = rBias, rNearSaturation // p13: |x| >= 0.3125 and rBias == 0x14
366 }
367 { .mfi
368 add rCoeffAddr1 = 48, rCoeffAddr1 // rCoeffAddr1 -> D2,B0
369 nop.f 0
370 nop.i 0
371 }
372 ;;
373
374 { .mfi
375 (p11) ldfpd fD0, fD1 = [rCoeffAddr3]
376 nop.f 0
377 nop.i 0
378 }
379 { .mfb
380 (p11) ldfpd fD2, fB0 = [rCoeffAddr1]
381 // sign(x)*|x|^2
382 fma.s1 fArgSqrSgn = fArgSqr, fSignumX, f0
383 (p10) br.cond.spnt tanhf_near_zero
384 }
385 ;;
386
387 { .mfi
388 (p11) ldfpd fA2, fA3 = [rCoeffAddr2], 16
389 fcmp.lt.s1 p15, p14 = f8,f0 // p15: x < 0, p14: otherwise
390 nop.i 0
391 }
392 { .mfb
393 (p12) ldfd fA0 = [rDataPtr] // saturation constant (rDataPtr was advanced by 544 under p12)
394 fma.s1 fArg4Sgn = fArg4, fSignumX, f0 // sign(x)*|x|^4
395 (p12) br.cond.spnt tanhf_saturation
396 }
397 ;;
398 { .mfi
399 nop.m 0
400 fma.s1 fArg7Sgn = fArg4, fArg3Sgn, f0 // sign(x)*|x|^7
401 nop.i 0
402 }
403 { .mfb
404 nop.m 0
405 fma.s1 fArg6Sgn = fArg3, fArg3Sgn, f0 // sign(x)*|x|^6
406 (p13) br.cond.spnt tanhf_close_to_saturation
407 }
408 ;;
409 // From here on: 0.3125 <= |x| < 8.0
410 { .mfi
411 nop.m 0
412 fma.s1 fPolC = fC3, fAbsArg, fC2 // C3*|x| + C2
413 nop.i 0
414 }
415 { .mfi
416 nop.m 0
417 fma.s1 fPolCTmp = fC1, fAbsArg, fC0 // C1*|x| + C0
418 nop.i 0
419 };;
420
421 { .mfi
422 nop.m 0
423 fma.s1 fPolA = fA1, fAbsArg, fA0 // A1*|x| + A0
424 nop.i 0
425 }
426 ;;
427
428 { .mfi
429 nop.m 0
430 fma.s1 fPolD = fD1, fAbsArg, fD0 // D1*|x| + D0
431 nop.i 0
432 }
433 { .mfi
434 nop.m 0
435 // sign(x)*(|x|^7 + D2*x^6)
436 fma.s1 fPolDTmp = fArg6Sgn, fD2, fArg7Sgn
437 nop.i 0
438 };;
439
440 { .mfi
441 nop.m 0
442 fma.s1 fPolATmp = fA3, fAbsArg, fA2 // A3*|x| + A2
443 nop.i 0
444 }
445 { .mfi
446 nop.m 0
447 fma.s1 fB0 = fB0, fArg4, f0 // B0*x^4 (overwrites the loaded coefficient)
448 nop.i 0
449 };;
450
451 { .mfi
452 nop.m 0
453 // C3*|x|^3 + C2*x^2 + C1*|x| + C0
454 fma.s1 fPolC = fPolC, fArgSqr, fPolCTmp
455 nop.i 0
456 }
457 ;;
458
459 { .mfi
460 nop.m 0
461 // PolD = sign(x)*(|x|^7 + D2*x^6 + D1*|x|^5 + D0*x^4)
462 fma.d.s1 fPolD = fPolD, fArg4Sgn, fPolDTmp
463 nop.i 0
464 }
465 ;;
466
467 { .mfi
468 nop.m 0
469 // PolA = A3|x|^3 + A2*x^2 + A1*|x| + A0
470 fma.d.s1 fPolA = fPolATmp, fArgSqr, fPolA
471 nop.i 0
472 }
473 ;;
474
475 { .mfi
476 nop.m 0
477 // PolC = B0*x^4 + C3*|x|^3 + C2*|x|^2 + C1*|x| + C0
478 fma.d.s1 fPolC = fPolC, f1, fB0
479 nop.i 0
480 }
481 ;;
482 // PolD already carries sign(x), so only the PolA term needs its sign switched
483 { .mfi
484 nop.m 0
485 (p14) fma.s.s0 f8 = fPolC, fPolD, fPolA // for positive x: PolC*PolD + PolA
486 nop.i 0
487 }
488 { .mfb
489 nop.m 0
490 (p15) fms.s.s0 f8 = fPolC, fPolD, fPolA // for negative x: PolC*PolD - PolA
491 br.ret.sptk b0 // Exit for 0.3125 <=|x|< 8.0
492 };;
493
494
495 // Here if |x| < 0.3125; fC0..fC3 hold the four entries of the -0.3125 < x < 0.3125 table
496 tanhf_near_zero:
497 { .mfi
498 nop.m 0
499 fma.s1 fPolC = fC3, fArgSqr, fC2 // C3*x^2 + C2
500 nop.i 0
501 }
502 { .mfi
503 nop.m 0
504 fma.s1 fPolCTmp = fC1, fArgSqr, fC0 // C1*x^2 + C0
505 nop.i 0
506 };;
507
508 { .mfi
509 nop.m 0
510 fma.s1 fPolC = fPolC, fArg4, fPolCTmp // C3*x^6 + C2*x^4 + C1*x^2 + C0
511 nop.i 0
512 };;
513
514 { .mfb
515 nop.m 0
516 // x + x^3*(C3*x^6 + C2*x^4 + C1*x^2 + C0); fArg3Sgn = x^3 carries the sign
517 fma.s.s0 f8 = fPolC, fArg3Sgn, f8
518 br.ret.sptk b0 // Exit for |x| < 0.3125
519 };;
520
521 // Here if 9.125 <= |x| < +inf
522 tanhf_saturation:
523 { .mfb
524 nop.m 0
525 fma.s.s0 f8 = fA0, fSignumX, f0 // sign(x)*(1.0d - 2^(-53))
526 // fA0 = 0x3FEFFFFFFFFFFFFF, loaded under p12 before the branch; result is single precision (.s)
527 br.ret.sptk b0 // Exit for 9.125 <=|x|< +inf
528 }
529 ;;
530
531 // Here if 8.0 <= |x| < 9.125: tanhf(x) = sign(x)*(A3|x|^3 + A2*x^2 + A1*|x| + A0)
532 tanhf_close_to_saturation:
533 { .mfi
534 nop.m 0
535 fma.s1 fPolATmp = fA1, fAbsArg, fA0 // A1*|x| + A0
536 nop.i 0
537 }
538 { .mfi
539 nop.m 0
540 fma.s1 fPolA = fA3, fAbsArg, fA2 // A3*|x| + A2
541 nop.i 0
542 }
543 ;;
544 // p14/p15 are declared mutually exclusive, so both predicated writes of f8 may share a group
545 .pred.rel "mutex", p14, p15
546 { .mfi
547 nop.m 0
548 // for positive x: (A3*|x| + A2)*x^2 + (A1*|x| + A0)
549 (p14) fma.s.s0 f8 = fPolA, fArgSqr, fPolATmp
550 nop.i 0
551 }
552 { .mfb
553 nop.m 0
554 // for negative x: (A3*|x| + A2)*(-x^2) - (A1*|x| + A0)
555 (p15) fms.s.s0 f8 = fPolA, fArgSqrSgn, fPolATmp
556 br.ret.sptk b0 // Exit for 8.0 <=|x|< 9.125
557 };;
558
559 // Here if x is single precision denormal
560 tanhf_denormal:
561 { .mfi
562 nop.m 0
563 fclass.m p7,p8 = f8, 0x0a // p7: x is -denormal, p8: otherwise (+denormal here)
564 nop.i 0
565 }
566 ;;
567 // NOTE(review): the x^2 term presumably exists to raise the FP status flags - confirm
568 { .mfi
569 nop.m 0
570 (p7) fma.s.s0 f8 = f8,f8,f8 // -denormal: x + x^2
571 nop.i 0
572 }
573 { .mfb
574 nop.m 0
575 (p8) fnma.s.s0 f8 = f8,f8,f8 // +denormal: x - x^2
576 br.ret.sptk b0 // Exit for denormal
577 }
578 ;;
579
580 GLOBAL_LIBM_END(tanhf)
581 libm_alias_float_other (tanh, tanh)