initial commit
[glibc.git] / sysdeps / ia64 / fpu / e_sinhl.S
1 .file "sinhl.s"
2
3
4 // Copyright (c) 2000 - 2002, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 //
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
18 //
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
22
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
38 //
39 // History
40 //==============================================================
41 // 02/02/00 Initial version
42 // 04/04/00 Unwind support added
43 // 08/15/00 Bundle added after call to __libm_error_support to properly
44 // set [the previously overwritten] GR_Parameter_RESULT.
45 // 10/12/00 Update to set denormal operand and underflow flags
46 // 01/22/01 Fixed to set inexact flag for small args. Fixed incorrect
47 // call to __libm_error_support for 710.476 < x < 11357.2166.
48 // 05/02/01 Reworked to improve speed of all paths
49 // 05/20/02 Cleaned up namespace and sf0 syntax
50 // 12/04/02 Improved performance
51 //
52 // API
53 //==============================================================
54 // long double = sinhl(long double)
55 // input floating point f8
56 // output floating point f8
57 //
58 // Registers used
59 //==============================================================
60 // general registers:
61 // r14 -> r40
62 // predicate registers used:
63 // p6 -> p11
64 // floating-point registers used:
65 // f9 -> f15; f32 -> f90;
66 // f8 has input, then output
67 //
68 // Overview of operation
69 //==============================================================
70 // There are seven paths
71 // 1. 0 < |x| < 0.25 SINH_BY_POLY
72 // 2. 0.25 <=|x| < 32 SINH_BY_TBL
73 // 3. 32 <= |x| < 11357.21655 SINH_BY_EXP (merged path with SINH_BY_TBL)
74 // 4. |x| >= 11357.21655 SINH_HUGE
75 // 5. x=0 Done with early exit
76 // 6. x=inf,nan Done with early exit
77 // 7. x=denormal SINH_DENORM
78 //
79 // For double extended we get overflow for x >= 400c b174 ddc0 31ae c0ea
80 // >= 11357.21655
81 //
82 //
83 // 1. SINH_BY_POLY 0 < |x| < 0.25
84 // ===============
85 // Evaluate sinh(x) by a 13th order polynomial
86 // Care is take for the order of multiplication; and P_1 is not exactly 1/3!,
87 // P_2 is not exactly 1/5!, etc.
88 // sinh(x) = sign * (series(e^x) - series(e^-x))/2
89 // = sign * (ax + ax^3/3! + ax^5/5! + ax^7/7! + ax^9/9! + ax^11/11!
90 // + ax^13/13!)
91 // = sign * (ax + ax * ( ax^2 * (1/3! + ax^4 * (1/7! + ax^4*1/11!)) )
92 // + ax * ( ax^4 * (1/5! + ax^4 * (1/9! + ax^4*1/13!)) ))
93 // = sign * (ax + ax*p_odd + (ax*p_even))
94 // = sign * (ax + Y_lo)
95 // sinh(x) = sign * (Y_hi + Y_lo)
96 // Note that ax = |x|
97 //
98 // 2. SINH_BY_TBL 0.25 <= |x| < 32.0
99 // =============
100 // sinh(x) = sinh(B+R)
101 // = sinh(B)cosh(R) + cosh(B)sinh(R)
102 //
103 // ax = |x| = M*log2/64 + R
104 // B = M*log2/64
105 // M = 64*N + j
106 // We will calculate M and get N as (M-j)/64
107 // The division is a shift.
108 // exp(B) = exp(N*log2 + j*log2/64)
109 // = 2^N * 2^(j*log2/64)
110 // sinh(B) = 1/2(e^B -e^-B)
111 // = 1/2(2^N * 2^(j*log2/64) - 2^-N * 2^(-j*log2/64))
112 // sinh(B) = (2^(N-1) * 2^(j*log2/64) - 2^(-N-1) * 2^(-j*log2/64))
113 // cosh(B) = (2^(N-1) * 2^(j*log2/64) + 2^(-N-1) * 2^(-j*log2/64))
114 // 2^(j*log2/64) is stored as Tjhi + Tjlo , j= -32,....,32
115 // Tjhi is double-extended (80-bit) and Tjlo is single(32-bit)
116 //
117 // R = ax - M*log2/64
118 // R = ax - M*log2_by_64_hi - M*log2_by_64_lo
119 // exp(R) = 1 + R +R^2(1/2! + R(1/3! + R(1/4! + ... + R(1/n!)...)
120 // = 1 + p_odd + p_even
121 // where the p_even uses the A coefficients and the p_even uses
122 // the B coefficients
123 //
124 // So sinh(R) = 1 + p_odd + p_even -(1 -p_odd -p_even)/2 = p_odd
125 // cosh(R) = 1 + p_even
126 // sinh(B) = S_hi + S_lo
127 // cosh(B) = C_hi
128 // sinh(x) = sinh(B)cosh(R) + cosh(B)sinh(R)
129 //
130 // 3. SINH_BY_EXP 32.0 <= |x| < 11357.21655 ( 400c b174 ddc0 31ae c0ea )
131 // ==============
132 // Can approximate result by exp(x)/2 in this region.
133 // Y_hi = Tjhi
134 // Y_lo = Tjhi * (p_odd + p_even) + Tjlo
135 // sinh(x) = Y_hi + Y_lo
136 //
137 // 4. SINH_HUGE |x| >= 11357.21655 ( 400c b174 ddc0 31ae c0ea )
138 // ============
139 // Set error tag and call error support
140 //
141 //
142 // Assembly macros
143 //==============================================================
144 r_ad5 = r14
145 r_rshf_2to57 = r15
146 r_exp_denorm = r15
147 r_ad_mJ_lo = r15
148 r_ad_J_lo = r16
149 r_2Nm1 = r17
150 r_2mNm1 = r18
151 r_exp_x = r18
152 r_ad_J_hi = r19
153 r_ad2o = r19
154 r_ad_mJ_hi = r20
155 r_mj = r21
156 r_ad2e = r22
157 r_ad3 = r23
158 r_ad1 = r24
159 r_Mmj = r24
160 r_rshf = r25
161 r_M = r25
162 r_N = r25
163 r_jshf = r26
164 r_exp_2tom57 = r26
165 r_j = r26
166 r_exp_mask = r27
167 r_signexp_x = r28
168 r_signexp_sgnx_0_5 = r28
169 r_exp_0_25 = r29
170 r_sig_inv_ln2 = r30
171 r_exp_32 = r30
172 r_exp_huge = r30
173 r_ad4 = r31
174
175 GR_SAVE_PFS = r34
176 GR_SAVE_B0 = r35
177 GR_SAVE_GP = r36
178
179 GR_Parameter_X = r37
180 GR_Parameter_Y = r38
181 GR_Parameter_RESULT = r39
182 GR_Parameter_TAG = r40
183
184
185 f_ABS_X = f9
186 f_X2 = f10
187 f_X4 = f11
188 f_tmp = f14
189 f_RSHF = f15
190
191 f_Inv_log2by64 = f32
192 f_log2by64_lo = f33
193 f_log2by64_hi = f34
194 f_A1 = f35
195
196 f_A2 = f36
197 f_A3 = f37
198 f_Rcub = f38
199 f_M_temp = f39
200 f_R_temp = f40
201
202 f_Rsq = f41
203 f_R = f42
204 f_M = f43
205 f_B1 = f44
206 f_B2 = f45
207
208 f_B3 = f46
209 f_peven_temp1 = f47
210 f_peven_temp2 = f48
211 f_peven = f49
212 f_podd_temp1 = f50
213
214 f_podd_temp2 = f51
215 f_podd = f52
216 f_poly65 = f53
217 f_poly6543 = f53
218 f_poly6to1 = f53
219 f_poly43 = f54
220 f_poly21 = f55
221
222 f_X3 = f56
223 f_INV_LN2_2TO63 = f57
224 f_RSHF_2TO57 = f58
225 f_2TOM57 = f59
226 f_smlst_oflow_input = f60
227
228 f_pre_result = f61
229 f_huge = f62
230 f_spos = f63
231 f_sneg = f64
232 f_Tjhi = f65
233
234 f_Tjlo = f66
235 f_Tmjhi = f67
236 f_Tmjlo = f68
237 f_S_hi = f69
238 f_SC_hi_temp = f70
239
240 f_S_lo_temp1 = f71
241 f_S_lo_temp2 = f72
242 f_S_lo_temp3 = f73
243 f_S_lo_temp4 = f73
244 f_S_lo = f74
245 f_C_hi = f75
246
247 f_Y_hi = f77
248 f_Y_lo_temp = f78
249 f_Y_lo = f79
250 f_NORM_X = f80
251
252 f_P1 = f81
253 f_P2 = f82
254 f_P3 = f83
255 f_P4 = f84
256 f_P5 = f85
257
258 f_P6 = f86
259 f_Tjhi_spos = f87
260 f_Tjlo_spos = f88
261 f_huge = f89
262 f_signed_hi_lo = f90
263
264
265 // Data tables
266 //==============================================================
267
268 // DO NOT CHANGE ORDER OF THESE TABLES
269 RODATA
270
271 .align 16
272 LOCAL_OBJECT_START(sinh_arg_reduction)
273 // data8 0xB8AA3B295C17F0BC, 0x00004005 // 64/log2 -- signif loaded with setf
274 data8 0xB17217F7D1000000, 0x00003FF8 // log2/64 high part
275 data8 0xCF79ABC9E3B39804, 0x00003FD0 // log2/64 low part
276 data8 0xb174ddc031aec0ea, 0x0000400c // Smallest x to overflow (11357.21655)
277 LOCAL_OBJECT_END(sinh_arg_reduction)
278
279 LOCAL_OBJECT_START(sinh_p_table)
280 data8 0xB08AF9AE78C1239F, 0x00003FDE // P6
281 data8 0xB8EF1D28926D8891, 0x00003FEC // P4
282 data8 0x8888888888888412, 0x00003FF8 // P2
283 data8 0xD732377688025BE9, 0x00003FE5 // P5
284 data8 0xD00D00D00D4D39F2, 0x00003FF2 // P3
285 data8 0xAAAAAAAAAAAAAAAB, 0x00003FFC // P1
286 LOCAL_OBJECT_END(sinh_p_table)
287
288 LOCAL_OBJECT_START(sinh_ab_table)
289 data8 0xAAAAAAAAAAAAAAAC, 0x00003FFC // A1
290 data8 0x88888888884ECDD5, 0x00003FF8 // A2
291 data8 0xD00D0C6DCC26A86B, 0x00003FF2 // A3
292 data8 0x8000000000000002, 0x00003FFE // B1
293 data8 0xAAAAAAAAAA402C77, 0x00003FFA // B2
294 data8 0xB60B6CC96BDB144D, 0x00003FF5 // B3
295 LOCAL_OBJECT_END(sinh_ab_table)
296
297 LOCAL_OBJECT_START(sinh_j_hi_table)
298 data8 0xB504F333F9DE6484, 0x00003FFE
299 data8 0xB6FD91E328D17791, 0x00003FFE
300 data8 0xB8FBAF4762FB9EE9, 0x00003FFE
301 data8 0xBAFF5AB2133E45FB, 0x00003FFE
302 data8 0xBD08A39F580C36BF, 0x00003FFE
303 data8 0xBF1799B67A731083, 0x00003FFE
304 data8 0xC12C4CCA66709456, 0x00003FFE
305 data8 0xC346CCDA24976407, 0x00003FFE
306 data8 0xC5672A115506DADD, 0x00003FFE
307 data8 0xC78D74C8ABB9B15D, 0x00003FFE
308 data8 0xC9B9BD866E2F27A3, 0x00003FFE
309 data8 0xCBEC14FEF2727C5D, 0x00003FFE
310 data8 0xCE248C151F8480E4, 0x00003FFE
311 data8 0xD06333DAEF2B2595, 0x00003FFE
312 data8 0xD2A81D91F12AE45A, 0x00003FFE
313 data8 0xD4F35AABCFEDFA1F, 0x00003FFE
314 data8 0xD744FCCAD69D6AF4, 0x00003FFE
315 data8 0xD99D15C278AFD7B6, 0x00003FFE
316 data8 0xDBFBB797DAF23755, 0x00003FFE
317 data8 0xDE60F4825E0E9124, 0x00003FFE
318 data8 0xE0CCDEEC2A94E111, 0x00003FFE
319 data8 0xE33F8972BE8A5A51, 0x00003FFE
320 data8 0xE5B906E77C8348A8, 0x00003FFE
321 data8 0xE8396A503C4BDC68, 0x00003FFE
322 data8 0xEAC0C6E7DD24392F, 0x00003FFE
323 data8 0xED4F301ED9942B84, 0x00003FFE
324 data8 0xEFE4B99BDCDAF5CB, 0x00003FFE
325 data8 0xF281773C59FFB13A, 0x00003FFE
326 data8 0xF5257D152486CC2C, 0x00003FFE
327 data8 0xF7D0DF730AD13BB9, 0x00003FFE
328 data8 0xFA83B2DB722A033A, 0x00003FFE
329 data8 0xFD3E0C0CF486C175, 0x00003FFE
330 data8 0x8000000000000000, 0x00003FFF // Center of table
331 data8 0x8164D1F3BC030773, 0x00003FFF
332 data8 0x82CD8698AC2BA1D7, 0x00003FFF
333 data8 0x843A28C3ACDE4046, 0x00003FFF
334 data8 0x85AAC367CC487B15, 0x00003FFF
335 data8 0x871F61969E8D1010, 0x00003FFF
336 data8 0x88980E8092DA8527, 0x00003FFF
337 data8 0x8A14D575496EFD9A, 0x00003FFF
338 data8 0x8B95C1E3EA8BD6E7, 0x00003FFF
339 data8 0x8D1ADF5B7E5BA9E6, 0x00003FFF
340 data8 0x8EA4398B45CD53C0, 0x00003FFF
341 data8 0x9031DC431466B1DC, 0x00003FFF
342 data8 0x91C3D373AB11C336, 0x00003FFF
343 data8 0x935A2B2F13E6E92C, 0x00003FFF
344 data8 0x94F4EFA8FEF70961, 0x00003FFF
345 data8 0x96942D3720185A00, 0x00003FFF
346 data8 0x9837F0518DB8A96F, 0x00003FFF
347 data8 0x99E0459320B7FA65, 0x00003FFF
348 data8 0x9B8D39B9D54E5539, 0x00003FFF
349 data8 0x9D3ED9A72CFFB751, 0x00003FFF
350 data8 0x9EF5326091A111AE, 0x00003FFF
351 data8 0xA0B0510FB9714FC2, 0x00003FFF
352 data8 0xA27043030C496819, 0x00003FFF
353 data8 0xA43515AE09E6809E, 0x00003FFF
354 data8 0xA5FED6A9B15138EA, 0x00003FFF
355 data8 0xA7CD93B4E965356A, 0x00003FFF
356 data8 0xA9A15AB4EA7C0EF8, 0x00003FFF
357 data8 0xAB7A39B5A93ED337, 0x00003FFF
358 data8 0xAD583EEA42A14AC6, 0x00003FFF
359 data8 0xAF3B78AD690A4375, 0x00003FFF
360 data8 0xB123F581D2AC2590, 0x00003FFF
361 data8 0xB311C412A9112489, 0x00003FFF
362 data8 0xB504F333F9DE6484, 0x00003FFF
363 LOCAL_OBJECT_END(sinh_j_hi_table)
364
365 LOCAL_OBJECT_START(sinh_j_lo_table)
366 data4 0x1EB2FB13
367 data4 0x1CE2CBE2
368 data4 0x1DDC3CBC
369 data4 0x1EE9AA34
370 data4 0x9EAEFDC1
371 data4 0x9DBF517B
372 data4 0x1EF88AFB
373 data4 0x1E03B216
374 data4 0x1E78AB43
375 data4 0x9E7B1747
376 data4 0x9EFE3C0E
377 data4 0x9D36F837
378 data4 0x9DEE53E4
379 data4 0x9E24AE8E
380 data4 0x1D912473
381 data4 0x1EB243BE
382 data4 0x1E669A2F
383 data4 0x9BBC610A
384 data4 0x1E761035
385 data4 0x9E0BE175
386 data4 0x1CCB12A1
387 data4 0x1D1BFE90
388 data4 0x1DF2F47A
389 data4 0x1EF22F22
390 data4 0x9E3F4A29
391 data4 0x1EC01A5B
392 data4 0x1E8CAC3A
393 data4 0x9DBB3FAB
394 data4 0x1EF73A19
395 data4 0x9BB795B5
396 data4 0x1EF84B76
397 data4 0x9EF5818B
398 data4 0x00000000 // Center of table
399 data4 0x1F77CACA
400 data4 0x1EF8A91D
401 data4 0x1E57C976
402 data4 0x9EE8DA92
403 data4 0x1EE85C9F
404 data4 0x1F3BF1AF
405 data4 0x1D80CA1E
406 data4 0x9D0373AF
407 data4 0x9F167097
408 data4 0x1EB70051
409 data4 0x1F6EB029
410 data4 0x1DFD6D8E
411 data4 0x9EB319B0
412 data4 0x1EBA2BEB
413 data4 0x1F11D537
414 data4 0x1F0D5A46
415 data4 0x9E5E7BCA
416 data4 0x9F3AAFD1
417 data4 0x9E86DACC
418 data4 0x9F3EDDC2
419 data4 0x1E496E3D
420 data4 0x9F490BF6
421 data4 0x1DD1DB48
422 data4 0x1E65EBFB
423 data4 0x9F427496
424 data4 0x1F283C4A
425 data4 0x1F4B0047
426 data4 0x1F130152
427 data4 0x9E8367C0
428 data4 0x9F705F90
429 data4 0x1EFB3C53
430 data4 0x1F32FB13
431 LOCAL_OBJECT_END(sinh_j_lo_table)
432
433
434 .section .text
435 GLOBAL_IEEE754_ENTRY(sinhl)
436
437 { .mlx
438 getf.exp r_signexp_x = f8 // Get signexp of x, must redo if unorm
439 movl r_sig_inv_ln2 = 0xb8aa3b295c17f0bc // significand of 1/ln2
440 }
441 { .mlx
442 addl r_ad1 = @ltoff(sinh_arg_reduction), gp
443 movl r_rshf_2to57 = 0x4778000000000000 // 1.10000 2^(63+57)
444 }
445 ;;
446
447 { .mfi
448 ld8 r_ad1 = [r_ad1]
449 fmerge.s f_ABS_X = f0,f8
450 mov r_exp_0_25 = 0x0fffd // Form exponent for 0.25
451 }
452 { .mfi
453 nop.m 0
454 fnorm.s1 f_NORM_X = f8
455 mov r_exp_2tom57 = 0xffff-57
456 }
457 ;;
458
459 { .mfi
460 setf.d f_RSHF_2TO57 = r_rshf_2to57 // Form const 1.100 * 2^120
461 fclass.m p10,p0 = f8, 0x0b // Test for denorm
462 mov r_exp_mask = 0x1ffff
463 }
464 { .mlx
465 setf.sig f_INV_LN2_2TO63 = r_sig_inv_ln2 // Form 1/ln2 * 2^63
466 movl r_rshf = 0x43e8000000000000 // 1.1000 2^63 for right shift
467 }
468 ;;
469
470 { .mfi
471 nop.m 0
472 fclass.m p7,p0 = f8, 0x07 // Test if x=0
473 nop.i 0
474 }
475 { .mfi
476 setf.exp f_2TOM57 = r_exp_2tom57 // Form 2^-57 for scaling
477 nop.f 0
478 add r_ad3 = 0x90, r_ad1 // Point to ab_table
479 }
480 ;;
481
482 { .mfi
483 setf.d f_RSHF = r_rshf // Form right shift const 1.100 * 2^63
484 fclass.m p6,p0 = f8, 0xe3 // Test if x nan, inf
485 add r_ad4 = 0x2f0, r_ad1 // Point to j_hi_table midpoint
486 }
487 { .mib
488 add r_ad2e = 0x20, r_ad1 // Point to p_table
489 nop.i 0
490 (p10) br.cond.spnt SINH_DENORM // Branch if x denorm
491 }
492 ;;
493
494 // Common path -- return here from SINH_DENORM if x is unnorm
495 SINH_COMMON:
496 { .mfi
497 ldfe f_smlst_oflow_input = [r_ad2e],16
498 nop.f 0
499 add r_ad5 = 0x580, r_ad1 // Point to j_lo_table midpoint
500 }
501 { .mib
502 ldfe f_log2by64_hi = [r_ad1],16
503 and r_exp_x = r_exp_mask, r_signexp_x
504 (p7) br.ret.spnt b0 // Exit if x=0
505 }
506 ;;
507
508 // Get the A coefficients for SINH_BY_TBL
509 { .mfi
510 ldfe f_A1 = [r_ad3],16
511 fcmp.lt.s1 p8,p9 = f8,f0 // Test for x<0
512 cmp.lt p7,p0 = r_exp_x, r_exp_0_25 // Test x < 0.25
513 }
514 { .mfb
515 add r_ad2o = 0x30, r_ad2e // Point to p_table odd coeffs
516 (p6) fma.s0 f8 = f8,f1,f0 // Result for x nan, inf
517 (p6) br.ret.spnt b0 // Exit for x nan, inf
518 }
519 ;;
520
521 // Calculate X2 = ax*ax for SINH_BY_POLY
522 { .mfi
523 ldfe f_log2by64_lo = [r_ad1],16
524 nop.f 0
525 nop.i 0
526 }
527 { .mfb
528 ldfe f_A2 = [r_ad3],16
529 fma.s1 f_X2 = f_NORM_X, f_NORM_X, f0
530 (p7) br.cond.spnt SINH_BY_POLY
531 }
532 ;;
533
534 // Here if |x| >= 0.25
535 SINH_BY_TBL:
536 // ******************************************************
537 // STEP 1 (TBL and EXP) - Argument reduction
538 // ******************************************************
539 // Get the following constants.
540 // Inv_log2by64
541 // log2by64_hi
542 // log2by64_lo
543
544
545 // We want 2^(N-1) and 2^(-N-1). So bias N-1 and -N-1 and
546 // put them in an exponent.
547 // f_spos = 2^(N-1) and f_sneg = 2^(-N-1)
548 // 0xffff + (N-1) = 0xffff +N -1
549 // 0xffff - (N +1) = 0xffff -N -1
550
551
552 // Calculate M and keep it as integer and floating point.
553 // M = round-to-integer(x*Inv_log2by64)
554 // f_M = M = truncate(ax/(log2/64))
555 // Put the integer representation of M in r_M
556 // and the floating point representation of M in f_M
557
558 // Get the remaining A,B coefficients
559 { .mmi
560 ldfe f_A3 = [r_ad3],16
561 nop.m 0
562 nop.i 0
563 }
564 ;;
565
566 .pred.rel "mutex",p8,p9
567 // Use constant (1.100*2^(63-6)) to get rounded M into rightmost significand
568 // |x| * 64 * 1/ln2 * 2^(63-6) + 1.1000 * 2^(63+(63-6))
569 { .mfi
570 (p8) mov r_signexp_sgnx_0_5 = 0x2fffe // signexp of -0.5
571 fma.s1 f_M_temp = f_ABS_X, f_INV_LN2_2TO63, f_RSHF_2TO57
572 (p9) mov r_signexp_sgnx_0_5 = 0x0fffe // signexp of +0.5
573 }
574 ;;
575
576 // Test for |x| >= overflow limit
577 { .mfi
578 ldfe f_B1 = [r_ad3],16
579 fcmp.ge.s1 p6,p0 = f_ABS_X, f_smlst_oflow_input
580 nop.i 0
581 }
582 ;;
583
584 { .mfi
585 ldfe f_B2 = [r_ad3],16
586 nop.f 0
587 mov r_exp_32 = 0x10004
588 }
589 ;;
590
591 // Subtract RSHF constant to get rounded M as a floating point value
592 // M_temp * 2^(63-6) - 2^63
593 { .mfb
594 ldfe f_B3 = [r_ad3],16
595 fms.s1 f_M = f_M_temp, f_2TOM57, f_RSHF
596 (p6) br.cond.spnt SINH_HUGE // Branch if result will overflow
597 }
598 ;;
599
600 { .mfi
601 getf.sig r_M = f_M_temp
602 nop.f 0
603 cmp.ge p7,p6 = r_exp_x, r_exp_32 // Test if x >= 32
604 }
605 ;;
606
607 // Calculate j. j is the signed extension of the six lsb of M. It
608 // has a range of -32 thru 31.
609
610 // Calculate R
611 // ax - M*log2by64_hi
612 // R = (ax - M*log2by64_hi) - M*log2by64_lo
613
614 { .mfi
615 nop.m 0
616 fnma.s1 f_R_temp = f_M, f_log2by64_hi, f_ABS_X
617 and r_j = 0x3f, r_M
618 }
619 ;;
620
621 { .mii
622 nop.m 0
623 shl r_jshf = r_j, 0x2 // Shift j so can sign extend it
624 ;;
625 sxt1 r_jshf = r_jshf
626 }
627 ;;
628
629 { .mii
630 nop.m 0
631 shr r_j = r_jshf, 0x2 // Now j has range -32 to 31
632 nop.i 0
633 }
634 ;;
635
636 { .mmi
637 shladd r_ad_J_hi = r_j, 4, r_ad4 // pointer to Tjhi
638 sub r_Mmj = r_M, r_j // M-j
639 sub r_mj = r0, r_j // Form -j
640 }
641 ;;
642
643 // The TBL and EXP branches are merged and predicated
644 // If TBL, p6 true, 0.25 <= |x| < 32
645 // If EXP, p7 true, 32 <= |x| < overflow_limit
646 //
647 // N = (M-j)/64
648 { .mfi
649 ldfe f_Tjhi = [r_ad_J_hi]
650 fnma.s1 f_R = f_M, f_log2by64_lo, f_R_temp
651 shr r_N = r_Mmj, 0x6 // N = (M-j)/64
652 }
653 { .mfi
654 shladd r_ad_mJ_hi = r_mj, 4, r_ad4 // pointer to Tmjhi
655 nop.f 0
656 shladd r_ad_mJ_lo = r_mj, 2, r_ad5 // pointer to Tmjlo
657 }
658 ;;
659
660 { .mfi
661 sub r_2mNm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(-N-1)
662 nop.f 0
663 shladd r_ad_J_lo = r_j, 2, r_ad5 // pointer to Tjlo
664 }
665 { .mfi
666 ldfe f_Tmjhi = [r_ad_mJ_hi]
667 nop.f 0
668 add r_2Nm1 = r_signexp_sgnx_0_5, r_N // signexp sgnx*2^(N-1)
669 }
670 ;;
671
672 { .mmf
673 ldfs f_Tmjlo = [r_ad_mJ_lo]
674 setf.exp f_sneg = r_2mNm1 // Form sgnx * 2^(-N-1)
675 nop.f 0
676 }
677 ;;
678
679 { .mmf
680 ldfs f_Tjlo = [r_ad_J_lo]
681 setf.exp f_spos = r_2Nm1 // Form sgnx * 2^(N-1)
682 nop.f 0
683 }
684 ;;
685
686 // ******************************************************
687 // STEP 2 (TBL and EXP)
688 // ******************************************************
689 // Calculate Rsquared and Rcubed in preparation for p_even and p_odd
690
691 { .mmf
692 nop.m 0
693 nop.m 0
694 fma.s1 f_Rsq = f_R, f_R, f0
695 }
696 ;;
697
698
699 // Calculate p_even
700 // B_2 + Rsq *B_3
701 // B_1 + Rsq * (B_2 + Rsq *B_3)
702 // p_even = Rsq * (B_1 + Rsq * (B_2 + Rsq *B_3))
703 { .mfi
704 nop.m 0
705 fma.s1 f_peven_temp1 = f_Rsq, f_B3, f_B2
706 nop.i 0
707 }
708 // Calculate p_odd
709 // A_2 + Rsq *A_3
710 // A_1 + Rsq * (A_2 + Rsq *A_3)
711 // podd = R + Rcub * (A_1 + Rsq * (A_2 + Rsq *A_3))
712 { .mfi
713 nop.m 0
714 fma.s1 f_podd_temp1 = f_Rsq, f_A3, f_A2
715 nop.i 0
716 }
717 ;;
718
719 { .mfi
720 nop.m 0
721 fma.s1 f_Rcub = f_Rsq, f_R, f0
722 nop.i 0
723 }
724 ;;
725
726 //
727 // If TBL,
728 // Calculate S_hi and S_lo, and C_hi
729 // SC_hi_temp = sneg * Tmjhi
730 // S_hi = spos * Tjhi - SC_hi_temp
731 // S_hi = spos * Tjhi - (sneg * Tmjhi)
732 // C_hi = spos * Tjhi + SC_hi_temp
733 // C_hi = spos * Tjhi + (sneg * Tmjhi)
734
735 { .mfi
736 nop.m 0
737 (p6) fma.s1 f_SC_hi_temp = f_sneg, f_Tmjhi, f0
738 nop.i 0
739 }
740 ;;
741
742 // If TBL,
743 // S_lo_temp3 = sneg * Tmjlo
744 // S_lo_temp4 = spos * Tjlo - S_lo_temp3
745 // S_lo_temp4 = spos * Tjlo -(sneg * Tmjlo)
746 { .mfi
747 nop.m 0
748 (p6) fma.s1 f_S_lo_temp3 = f_sneg, f_Tmjlo, f0
749 nop.i 0
750 }
751 ;;
752
753 { .mfi
754 nop.m 0
755 fma.s1 f_peven_temp2 = f_Rsq, f_peven_temp1, f_B1
756 nop.i 0
757 }
758 { .mfi
759 nop.m 0
760 fma.s1 f_podd_temp2 = f_Rsq, f_podd_temp1, f_A1
761 nop.i 0
762 }
763 ;;
764
765 // If EXP,
766 // Compute sgnx * 2^(N-1) * Tjhi and sgnx * 2^(N-1) * Tjlo
767 { .mfi
768 nop.m 0
769 (p7) fma.s1 f_Tjhi_spos = f_Tjhi, f_spos, f0
770 nop.i 0
771 }
772 { .mfi
773 nop.m 0
774 (p7) fma.s1 f_Tjlo_spos = f_Tjlo, f_spos, f0
775 nop.i 0
776 }
777 ;;
778
779 { .mfi
780 nop.m 0
781 (p6) fms.s1 f_S_hi = f_spos, f_Tjhi, f_SC_hi_temp
782 nop.i 0
783 }
784 ;;
785
786 { .mfi
787 nop.m 0
788 (p6) fma.s1 f_C_hi = f_spos, f_Tjhi, f_SC_hi_temp
789 nop.i 0
790 }
791 { .mfi
792 nop.m 0
793 (p6) fms.s1 f_S_lo_temp4 = f_spos, f_Tjlo, f_S_lo_temp3
794 nop.i 0
795 }
796 ;;
797
798 { .mfi
799 nop.m 0
800 fma.s1 f_peven = f_Rsq, f_peven_temp2, f0
801 nop.i 0
802 }
803 { .mfi
804 nop.m 0
805 fma.s1 f_podd = f_podd_temp2, f_Rcub, f_R
806 nop.i 0
807 }
808 ;;
809
810 // If TBL,
811 // S_lo_temp1 = spos * Tjhi - S_hi
812 // S_lo_temp2 = -sneg * Tmjlo + S_lo_temp1
813 // S_lo_temp2 = -sneg * Tmjlo + (spos * Tjhi - S_hi)
814
815 { .mfi
816 nop.m 0
817 (p6) fms.s1 f_S_lo_temp1 = f_spos, f_Tjhi, f_S_hi
818 nop.i 0
819 }
820 ;;
821
822 { .mfi
823 nop.m 0
824 (p6) fnma.s1 f_S_lo_temp2 = f_sneg, f_Tmjhi, f_S_lo_temp1
825 nop.i 0
826 }
827 ;;
828
829 // If EXP,
830 // Y_hi = sgnx * 2^(N-1) * Tjhi
831 // Y_lo = sgnx * 2^(N-1) * Tjhi * (p_odd + p_even) + sgnx * 2^(N-1) * Tjlo
832 { .mfi
833 nop.m 0
834 (p7) fma.s1 f_Y_lo_temp = f_peven, f1, f_podd
835 nop.i 0
836 }
837 ;;
838
839 // If TBL,
840 // S_lo = S_lo_temp4 + S_lo_temp2
841 { .mfi
842 nop.m 0
843 (p6) fma.s1 f_S_lo = f_S_lo_temp4, f1, f_S_lo_temp2
844 nop.i 0
845 }
846 ;;
847
848 // If TBL,
849 // Y_hi = S_hi
850 // Y_lo = C_hi*p_odd + (S_hi*p_even + S_lo)
851 { .mfi
852 nop.m 0
853 (p6) fma.s1 f_Y_lo_temp = f_S_hi, f_peven, f_S_lo
854 nop.i 0
855 }
856 ;;
857
858 { .mfi
859 nop.m 0
860 (p7) fma.s1 f_Y_lo = f_Tjhi_spos, f_Y_lo_temp, f_Tjlo_spos
861 nop.i 0
862 }
863 ;;
864
865 // Dummy multiply to generate inexact
866 { .mfi
867 nop.m 0
868 fmpy.s0 f_tmp = f_B2, f_B2
869 nop.i 0
870 }
871 { .mfi
872 nop.m 0
873 (p6) fma.s1 f_Y_lo = f_C_hi, f_podd, f_Y_lo_temp
874 nop.i 0
875 }
876 ;;
877
878 // f8 = answer = Y_hi + Y_lo
879 { .mfi
880 nop.m 0
881 (p7) fma.s0 f8 = f_Y_lo, f1, f_Tjhi_spos
882 nop.i 0
883 }
884 ;;
885
886 // f8 = answer = Y_hi + Y_lo
887 { .mfb
888 nop.m 0
889 (p6) fma.s0 f8 = f_Y_lo, f1, f_S_hi
890 br.ret.sptk b0 // Exit for SINH_BY_TBL and SINH_BY_EXP
891 }
892 ;;
893
894
895 // Here if 0 < |x| < 0.25
896 SINH_BY_POLY:
897 { .mmf
898 ldfe f_P6 = [r_ad2e],16
899 ldfe f_P5 = [r_ad2o],16
900 nop.f 0
901 }
902 ;;
903
904 { .mmi
905 ldfe f_P4 = [r_ad2e],16
906 ldfe f_P3 = [r_ad2o],16
907 nop.i 0
908 }
909 ;;
910
911 { .mmi
912 ldfe f_P2 = [r_ad2e],16
913 ldfe f_P1 = [r_ad2o],16
914 nop.i 0
915 }
916 ;;
917
918 { .mfi
919 nop.m 0
920 fma.s1 f_X3 = f_NORM_X, f_X2, f0
921 nop.i 0
922 }
923 { .mfi
924 nop.m 0
925 fma.s1 f_X4 = f_X2, f_X2, f0
926 nop.i 0
927 }
928 ;;
929
930 { .mfi
931 nop.m 0
932 fma.s1 f_poly65 = f_X2, f_P6, f_P5
933 nop.i 0
934 }
935 { .mfi
936 nop.m 0
937 fma.s1 f_poly43 = f_X2, f_P4, f_P3
938 nop.i 0
939 }
940 ;;
941
942 { .mfi
943 nop.m 0
944 fma.s1 f_poly21 = f_X2, f_P2, f_P1
945 nop.i 0
946 }
947 ;;
948
949 { .mfi
950 nop.m 0
951 fma.s1 f_poly6543 = f_X4, f_poly65, f_poly43
952 nop.i 0
953 }
954 ;;
955
956 { .mfi
957 nop.m 0
958 fma.s1 f_poly6to1 = f_X4, f_poly6543, f_poly21
959 nop.i 0
960 }
961 ;;
962
963 // Dummy multiply to generate inexact
964 { .mfi
965 nop.m 0
966 fmpy.s0 f_tmp = f_P6, f_P6
967 nop.i 0
968 }
969 { .mfb
970 nop.m 0
971 fma.s0 f8 = f_poly6to1, f_X3, f_NORM_X
972 br.ret.sptk b0 // Exit SINH_BY_POLY
973 }
974 ;;
975
976
977 // Here if x denorm or unorm
978 SINH_DENORM:
979 // Determine if x really a denorm and not a unorm
980 { .mmf
981 getf.exp r_signexp_x = f_NORM_X
982 mov r_exp_denorm = 0x0c001 // Real denorms have exp < this
983 fmerge.s f_ABS_X = f0, f_NORM_X
984 }
985 ;;
986
987 { .mfi
988 nop.m 0
989 fcmp.eq.s0 p10,p0 = f8, f0 // Set denorm flag
990 nop.i 0
991 }
992 ;;
993
994 // Set p8 if really a denorm
995 { .mmi
996 and r_exp_x = r_exp_mask, r_signexp_x
997 ;;
998 cmp.lt p8,p9 = r_exp_x, r_exp_denorm
999 nop.i 0
1000 }
1001 ;;
1002
1003 // Identify denormal operands.
1004 { .mfb
1005 nop.m 0
1006 (p8) fcmp.ge.unc.s1 p6,p7 = f8, f0 // Test sign of denorm
1007 (p9) br.cond.sptk SINH_COMMON // Return to main path if x unorm
1008 }
1009 ;;
1010
1011 { .mfi
1012 nop.m 0
1013 (p6) fma.s0 f8 = f8,f8,f8 // If x +denorm, result=x+x^2
1014 nop.i 0
1015 }
1016 { .mfb
1017 nop.m 0
1018 (p7) fnma.s0 f8 = f8,f8,f8 // If x -denorm, result=x-x^2
1019 br.ret.sptk b0 // Exit if x denorm
1020 }
1021 ;;
1022
1023
1024 // Here if |x| >= overflow limit
1025 SINH_HUGE:
1026 // for SINH_HUGE, put 24000 in exponent; take sign from input
1027 { .mmi
1028 mov r_exp_huge = 0x15dbf
1029 ;;
1030 setf.exp f_huge = r_exp_huge
1031 nop.i 0
1032 }
1033 ;;
1034
1035 .pred.rel "mutex",p8,p9
1036 { .mfi
1037 alloc r32 = ar.pfs,0,5,4,0
1038 (p8) fnma.s1 f_signed_hi_lo = f_huge, f1, f1
1039 nop.i 0
1040 }
1041 { .mfi
1042 nop.m 0
1043 (p9) fma.s1 f_signed_hi_lo = f_huge, f1, f1
1044 nop.i 0
1045 }
1046 ;;
1047
1048 { .mfi
1049 nop.m 0
1050 fma.s0 f_pre_result = f_signed_hi_lo, f_huge, f0
1051 mov GR_Parameter_TAG = 126
1052 }
1053 ;;
1054
1055 GLOBAL_IEEE754_END(sinhl)
1056 libm_alias_ldouble_other (__sinh, sinh)
1057
1058
1059 LOCAL_LIBM_ENTRY(__libm_error_region)
1060 .prologue
1061
1062 { .mfi
1063 add GR_Parameter_Y=-32,sp // Parameter 2 value
1064 nop.f 0
1065 .save ar.pfs,GR_SAVE_PFS
1066 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1067 }
1068 { .mfi
1069 .fframe 64
1070 add sp=-64,sp // Create new stack
1071 nop.f 0
1072 mov GR_SAVE_GP=gp // Save gp
1073 };;
1074
1075 { .mmi
1076 stfe [GR_Parameter_Y] = f0,16 // STORE Parameter 2 on stack
1077 add GR_Parameter_X = 16,sp // Parameter 1 address
1078 .save b0, GR_SAVE_B0
1079 mov GR_SAVE_B0=b0 // Save b0
1080 };;
1081
1082 .body
1083 { .mib
1084 stfe [GR_Parameter_X] = f8 // STORE Parameter 1 on stack
1085 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1086 nop.b 0
1087 }
1088 { .mib
1089 stfe [GR_Parameter_Y] = f_pre_result // STORE Parameter 3 on stack
1090 add GR_Parameter_Y = -16,GR_Parameter_Y
1091 br.call.sptk b0=__libm_error_support# // Call error handling function
1092 };;
1093
1094 { .mmi
1095 add GR_Parameter_RESULT = 48,sp
1096 nop.m 0
1097 nop.i 0
1098 };;
1099
1100 { .mmi
1101 ldfe f8 = [GR_Parameter_RESULT] // Get return result off stack
1102 .restore sp
1103 add sp = 64,sp // Restore stack pointer
1104 mov b0 = GR_SAVE_B0 // Restore return address
1105 };;
1106
1107 { .mib
1108 mov gp = GR_SAVE_GP // Restore gp
1109 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1110 br.ret.sptk b0 // Return
1111 };;
1112
1113 LOCAL_LIBM_END(__libm_error_region)
1114
1115
1116 .type __libm_error_support#,@function
1117 .global __libm_error_support#