initial commit
[glibc.git] / sysdeps / ia64 / fpu / s_asinhl.S
1 .file "asinhl.s"
2
3
4 // Copyright (c) 2000 - 2003, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 //
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
18 //
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
22
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
38 //
39 //*********************************************************************
40 //
41 // History:
42 // 09/04/01 Initial version
43 // 09/13/01 Performance improved, symmetry problems fixed
44 // 10/10/01 Performance improved, split issues removed
45 // 12/11/01 Changed huges_logp to not be global
46 // 05/20/02 Cleaned up namespace and sf0 syntax
47 // 02/10/03 Reordered header: .section, .global, .proc, .align;
48 // used data8 for long double table values
49 //
50 //*********************************************************************
51 //
52 // API
53 //==============================================================
54 // long double asinhl(long double);
55 //
56 // Overview of operation
57 //==============================================================
58 //
59 // There are 6 paths:
60 // 1. x = 0, [S,Q]Nan or +/-INF
61 // Return asinhl(x) = x + x;
62 //
63 // 2. x = + denormal
64 // Return asinhl(x) = x - x^2;
65 //
66 // 3. x = - denormal
67 // Return asinhl(x) = x + x^2;
68 //
69 // 4. 'Near 0': max denormal < |x| < 1/128
70 // Return asinhl(x) = sign(x)*(x+x^3*(c3+x^2*(c5+x^2*(c7+x^2*(c9)))));
71 //
72 // 5. 'Huges': |x| > 2^63
73 // Return asinhl(x) = sign(x)*(logl(2*x));
74 //
75 // 6. 'Main path': 1/128 < |x| < 2^63
76 // b_hi + b_lo = x + sqrt(x^2 + 1);
77 // asinhl(x) = sign(x)*(log_special(b_hi, b_lo));
78 //
79 // Algorithm description
80 //==============================================================
81 //
82 // Main path algorithm
83 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
84 // *************************************************************************
85 //
86 // There are 3 parts of x+sqrt(x^2+1) computation:
87 //
88 // 1) p2 = (p2_hi+p2_lo) = x^2+1 obtaining
89 // ------------------------------------
90 // p2_hi = x2_hi + 1, where x2_hi = x * x;
91 // p2_lo = x2_lo + p1_lo, where
92 // x2_lo = FMS(x*x-x2_hi),
93 // p1_lo = (1 - p2_hi) + x2_hi;
94 //
95 // 2) g = (g_hi+g_lo) = sqrt(p2) = sqrt(p2_hi+p2_lo)
96 // ----------------------------------------------
97 // r = invsqrt(p2_hi) (8-bit reciprocal square root approximation);
98 // g = p2_hi * r (first 8 bit-approximation of sqrt);
99 //
100 // h = 0.5 * r;
101 // e = 0.5 - g * h;
102 // g = g * e + g (second 16 bit-approximation of sqrt);
103 //
104 // h = h * e + h;
105 // e = 0.5 - g * h;
106 // g = g * e + g (third 32 bit-approximation of sqrt);
107 //
108 // h = h * e + h;
109 // e = 0.5 - g * h;
110 // g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
111 //
112 // Remainder computation:
113 // h = h * e + h;
114 // d = (p2_hi - g_hi * g_hi) + p2_lo;
115 // g_lo = d * h;
116 //
117 // 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2+1)
118 // -------------------------------------------------------------------
119 // b_hi = (g_hi + x) + gl;
120 // b_lo = (g_hi - b_hi) + x + gl;
121 //
122 // Now we pass b presented as sum b_hi + b_lo to special version
123 // of logl function which accept a pair of arguments as
124 // 'mutiprecision' value.
125 //
126 // Special log algorithm overview
127 // ================================
128 // Here we use a table lookup method. The basic idea is that in
129 // order to compute logl(Arg) = logl (Arg-1) for an argument Arg in [1,2),
130 // we construct a value G such that G*Arg is close to 1 and that
131 // logl(1/G) is obtainable easily from a table of values calculated
132 // beforehand. Thus
133 //
134 // logl(Arg) = logl(1/G) + logl((G*Arg - 1))
135 //
136 // Because |G*Arg - 1| is small, the second term on the right hand
137 // side can be approximated by a short polynomial. We elaborate
138 // this method in four steps.
139 //
140 // Step 0: Initialization
141 //
142 // We need to calculate logl( X ). Obtain N, S_hi such that
143 //
144 // X = 2^N * ( S_hi + S_lo ) exactly
145 //
146 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
147 // that |S_lo| <= ulp(S_hi).
148 //
149 // For the special version of logl: S_lo = b_lo
150 // !-----------------------------------------------!
151 //
152 // Step 1: Argument Reduction
153 //
154 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
155 //
156 // G := G_1 * G_2 * G_3
157 // r := (G * S_hi - 1) + G * S_lo
158 //
159 // These G_j's have the property that the product is exactly
160 // representable and that |r| < 2^(-12) as a result.
161 //
162 // Step 2: Approximation
163 //
164 // logl(1 + r) is approximated by a short polynomial poly(r).
165 //
166 // Step 3: Reconstruction
167 //
168 // Finally,
169 //
170 // logl( X ) = logl( 2^N * (S_hi + S_lo) )
171 // ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
172 // ~=~ N*logl(2) + logl(1/G) + poly(r).
173 //
174 // For detailed description see logl or log1pl function, regular path.
175 //
176 // Registers used
177 //==============================================================
178 // Floating Point registers used:
179 // f8, input
180 // f32 -> f101 (70 registers)
181
182 // General registers used:
183 // r32 -> r57 (26 registers)
184
185 // Predicate registers used:
186 // p6 -> p11
187 // p6 for '0, NaNs, Inf' path
188 // p7 for '+ denormals' path
189 // p8 for 'near 0' path
190 // p9 for 'huges' path
191 // p10 for '- denormals' path
192 // p11 for negative values
193 //
194 // Data tables
195 //==============================================================
196
197 RODATA
198 .align 64
199
200 // C7, C9 'near 0' polynomial coefficients
201 LOCAL_OBJECT_START(Poly_C_near_0_79)
202 data8 0xF8DC939BBEDD5A54, 0x00003FF9
203 data8 0xB6DB6DAB21565AC5, 0x0000BFFA
204 LOCAL_OBJECT_END(Poly_C_near_0_79)
205
206 // C3, C5 'near 0' polynomial coefficients
207 LOCAL_OBJECT_START(Poly_C_near_0_35)
208 data8 0x999999999991D582, 0x00003FFB
209 data8 0xAAAAAAAAAAAAAAA9, 0x0000BFFC
210 LOCAL_OBJECT_END(Poly_C_near_0_35)
211
212 // Q coeffs
213 LOCAL_OBJECT_START(Constants_Q)
214 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
215 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
216 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
217 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
218 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
219 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
220 LOCAL_OBJECT_END(Constants_Q)
221
222 // Z1 - 16 bit fixed
223 LOCAL_OBJECT_START(Constants_Z_1)
224 data4 0x00008000
225 data4 0x00007879
226 data4 0x000071C8
227 data4 0x00006BCB
228 data4 0x00006667
229 data4 0x00006187
230 data4 0x00005D18
231 data4 0x0000590C
232 data4 0x00005556
233 data4 0x000051EC
234 data4 0x00004EC5
235 data4 0x00004BDB
236 data4 0x00004925
237 data4 0x0000469F
238 data4 0x00004445
239 data4 0x00004211
240 LOCAL_OBJECT_END(Constants_Z_1)
241
242 // G1 and H1 - IEEE single and h1 - IEEE double
243 LOCAL_OBJECT_START(Constants_G_H_h1)
244 data4 0x3F800000,0x00000000
245 data8 0x0000000000000000
246 data4 0x3F70F0F0,0x3D785196
247 data8 0x3DA163A6617D741C
248 data4 0x3F638E38,0x3DF13843
249 data8 0x3E2C55E6CBD3D5BB
250 data4 0x3F579430,0x3E2FF9A0
251 data8 0xBE3EB0BFD86EA5E7
252 data4 0x3F4CCCC8,0x3E647FD6
253 data8 0x3E2E6A8C86B12760
254 data4 0x3F430C30,0x3E8B3AE7
255 data8 0x3E47574C5C0739BA
256 data4 0x3F3A2E88,0x3EA30C68
257 data8 0x3E20E30F13E8AF2F
258 data4 0x3F321640,0x3EB9CEC8
259 data8 0xBE42885BF2C630BD
260 data4 0x3F2AAAA8,0x3ECF9927
261 data8 0x3E497F3497E577C6
262 data4 0x3F23D708,0x3EE47FC5
263 data8 0x3E3E6A6EA6B0A5AB
264 data4 0x3F1D89D8,0x3EF8947D
265 data8 0xBDF43E3CD328D9BE
266 data4 0x3F17B420,0x3F05F3A1
267 data8 0x3E4094C30ADB090A
268 data4 0x3F124920,0x3F0F4303
269 data8 0xBE28FBB2FC1FE510
270 data4 0x3F0D3DC8,0x3F183EBF
271 data8 0x3E3A789510FDE3FA
272 data4 0x3F088888,0x3F20EC80
273 data8 0x3E508CE57CC8C98F
274 data4 0x3F042108,0x3F29516A
275 data8 0xBE534874A223106C
276 LOCAL_OBJECT_END(Constants_G_H_h1)
277
278 // Z2 - 16 bit fixed
279 LOCAL_OBJECT_START(Constants_Z_2)
280 data4 0x00008000
281 data4 0x00007F81
282 data4 0x00007F02
283 data4 0x00007E85
284 data4 0x00007E08
285 data4 0x00007D8D
286 data4 0x00007D12
287 data4 0x00007C98
288 data4 0x00007C20
289 data4 0x00007BA8
290 data4 0x00007B31
291 data4 0x00007ABB
292 data4 0x00007A45
293 data4 0x000079D1
294 data4 0x0000795D
295 data4 0x000078EB
296 LOCAL_OBJECT_END(Constants_Z_2)
297
298 // G2 and H2 - IEEE single and h2 - IEEE double
299 LOCAL_OBJECT_START(Constants_G_H_h2)
300 data4 0x3F800000,0x00000000
301 data8 0x0000000000000000
302 data4 0x3F7F00F8,0x3B7F875D
303 data8 0x3DB5A11622C42273
304 data4 0x3F7E03F8,0x3BFF015B
305 data8 0x3DE620CF21F86ED3
306 data4 0x3F7D08E0,0x3C3EE393
307 data8 0xBDAFA07E484F34ED
308 data4 0x3F7C0FC0,0x3C7E0586
309 data8 0xBDFE07F03860BCF6
310 data4 0x3F7B1880,0x3C9E75D2
311 data8 0x3DEA370FA78093D6
312 data4 0x3F7A2328,0x3CBDC97A
313 data8 0x3DFF579172A753D0
314 data4 0x3F792FB0,0x3CDCFE47
315 data8 0x3DFEBE6CA7EF896B
316 data4 0x3F783E08,0x3CFC15D0
317 data8 0x3E0CF156409ECB43
318 data4 0x3F774E38,0x3D0D874D
319 data8 0xBE0B6F97FFEF71DF
320 data4 0x3F766038,0x3D1CF49B
321 data8 0xBE0804835D59EEE8
322 data4 0x3F757400,0x3D2C531D
323 data8 0x3E1F91E9A9192A74
324 data4 0x3F748988,0x3D3BA322
325 data8 0xBE139A06BF72A8CD
326 data4 0x3F73A0D0,0x3D4AE46F
327 data8 0x3E1D9202F8FBA6CF
328 data4 0x3F72B9D0,0x3D5A1756
329 data8 0xBE1DCCC4BA796223
330 data4 0x3F71D488,0x3D693B9D
331 data8 0xBE049391B6B7C239
332 LOCAL_OBJECT_END(Constants_G_H_h2)
333
334 // G3 and H3 - IEEE single and h3 - IEEE double
335 LOCAL_OBJECT_START(Constants_G_H_h3)
336 data4 0x3F7FFC00,0x38800100
337 data8 0x3D355595562224CD
338 data4 0x3F7FF400,0x39400480
339 data8 0x3D8200A206136FF6
340 data4 0x3F7FEC00,0x39A00640
341 data8 0x3DA4D68DE8DE9AF0
342 data4 0x3F7FE400,0x39E00C41
343 data8 0xBD8B4291B10238DC
344 data4 0x3F7FDC00,0x3A100A21
345 data8 0xBD89CCB83B1952CA
346 data4 0x3F7FD400,0x3A300F22
347 data8 0xBDB107071DC46826
348 data4 0x3F7FCC08,0x3A4FF51C
349 data8 0x3DB6FCB9F43307DB
350 data4 0x3F7FC408,0x3A6FFC1D
351 data8 0xBD9B7C4762DC7872
352 data4 0x3F7FBC10,0x3A87F20B
353 data8 0xBDC3725E3F89154A
354 data4 0x3F7FB410,0x3A97F68B
355 data8 0xBD93519D62B9D392
356 data4 0x3F7FAC18,0x3AA7EB86
357 data8 0x3DC184410F21BD9D
358 data4 0x3F7FA420,0x3AB7E101
359 data8 0xBDA64B952245E0A6
360 data4 0x3F7F9C20,0x3AC7E701
361 data8 0x3DB4B0ECAABB34B8
362 data4 0x3F7F9428,0x3AD7DD7B
363 data8 0x3D9923376DC40A7E
364 data4 0x3F7F8C30,0x3AE7D474
365 data8 0x3DC6E17B4F2083D3
366 data4 0x3F7F8438,0x3AF7CBED
367 data8 0x3DAE314B811D4394
368 data4 0x3F7F7C40,0x3B03E1F3
369 data8 0xBDD46F21B08F2DB1
370 data4 0x3F7F7448,0x3B0BDE2F
371 data8 0xBDDC30A46D34522B
372 data4 0x3F7F6C50,0x3B13DAAA
373 data8 0x3DCB0070B1F473DB
374 data4 0x3F7F6458,0x3B1BD766
375 data8 0xBDD65DDC6AD282FD
376 data4 0x3F7F5C68,0x3B23CC5C
377 data8 0xBDCDAB83F153761A
378 data4 0x3F7F5470,0x3B2BC997
379 data8 0xBDDADA40341D0F8F
380 data4 0x3F7F4C78,0x3B33C711
381 data8 0x3DCD1BD7EBC394E8
382 data4 0x3F7F4488,0x3B3BBCC6
383 data8 0xBDC3532B52E3E695
384 data4 0x3F7F3C90,0x3B43BAC0
385 data8 0xBDA3961EE846B3DE
386 data4 0x3F7F34A0,0x3B4BB0F4
387 data8 0xBDDADF06785778D4
388 data4 0x3F7F2CA8,0x3B53AF6D
389 data8 0x3DCC3ED1E55CE212
390 data4 0x3F7F24B8,0x3B5BA620
391 data8 0xBDBA31039E382C15
392 data4 0x3F7F1CC8,0x3B639D12
393 data8 0x3D635A0B5C5AF197
394 data4 0x3F7F14D8,0x3B6B9444
395 data8 0xBDDCCB1971D34EFC
396 data4 0x3F7F0CE0,0x3B7393BC
397 data8 0x3DC7450252CD7ADA
398 data4 0x3F7F04F0,0x3B7B8B6D
399 data8 0xBDB68F177D7F2A42
400 LOCAL_OBJECT_END(Constants_G_H_h3)
401
402 // Assembly macros
403 //==============================================================
404
405 // Floating Point Registers
406
407 FR_Arg = f8
408 FR_Res = f8
409 FR_AX = f32
410 FR_XLog_Hi = f33
411 FR_XLog_Lo = f34
412
413 // Special logl registers
414 FR_Y_hi = f35
415 FR_Y_lo = f36
416
417 FR_Scale = f37
418 FR_X_Prime = f38
419 FR_S_hi = f39
420 FR_W = f40
421 FR_G = f41
422
423 FR_H = f42
424 FR_wsq = f43
425 FR_w4 = f44
426 FR_h = f45
427 FR_w6 = f46
428
429 FR_G2 = f47
430 FR_H2 = f48
431 FR_poly_lo = f49
432 FR_P8 = f50
433 FR_poly_hi = f51
434
435 FR_P7 = f52
436 FR_h2 = f53
437 FR_rsq = f54
438 FR_P6 = f55
439 FR_r = f56
440
441 FR_log2_hi = f57
442 FR_log2_lo = f58
443
444 FR_float_N = f59
445 FR_Q4 = f60
446
447 FR_G3 = f61
448 FR_H3 = f62
449 FR_h3 = f63
450
451 FR_Q3 = f64
452 FR_Q2 = f65
453 FR_1LN10_hi = f66
454
455 FR_Q1 = f67
456 FR_1LN10_lo = f68
457 FR_P5 = f69
458 FR_rcub = f70
459
460 FR_Neg_One = f71
461 FR_Z = f72
462 FR_AA = f73
463 FR_BB = f74
464 FR_S_lo = f75
465 FR_2_to_minus_N = f76
466
467
468 // Huge & Main path prolog registers
469 FR_Half = f77
470 FR_Two = f78
471 FR_X2 = f79
472 FR_P2 = f80
473 FR_P2L = f81
474 FR_Rcp = f82
475 FR_GG = f83
476 FR_HH = f84
477 FR_EE = f85
478 FR_DD = f86
479 FR_GL = f87
480 FR_A = f88
481 FR_AL = f89
482 FR_B = f90
483 FR_BL = f91
484 FR_Tmp = f92
485
486 // Near 0 & Huges path prolog registers
487 FR_C3 = f93
488 FR_C5 = f94
489 FR_C7 = f95
490 FR_C9 = f96
491
492 FR_X3 = f97
493 FR_X4 = f98
494 FR_P9 = f99
495 FR_P5 = f100
496 FR_P3 = f101
497
498
499 // General Purpose Registers
500
501 // General prolog registers
502 GR_PFS = r32
503 GR_TwoN7 = r40
504 GR_TwoP63 = r41
505 GR_ExpMask = r42
506 GR_ArgExp = r43
507 GR_Half = r44
508
509 // Near 0 path prolog registers
510 GR_Poly_C_35 = r45
511 GR_Poly_C_79 = r46
512
513 // Special logl registers
514 GR_Index1 = r34
515 GR_Index2 = r35
516 GR_signif = r36
517 GR_X_0 = r37
518 GR_X_1 = r38
519 GR_X_2 = r39
520 GR_Z_1 = r40
521 GR_Z_2 = r41
522 GR_N = r42
523 GR_Bias = r43
524 GR_M = r44
525 GR_Index3 = r45
526 GR_exp_2tom80 = r45
527 GR_exp_mask = r47
528 GR_exp_2tom7 = r48
529 GR_ad_ln10 = r49
530 GR_ad_tbl_1 = r50
531 GR_ad_tbl_2 = r51
532 GR_ad_tbl_3 = r52
533 GR_ad_q = r53
534 GR_ad_z_1 = r54
535 GR_ad_z_2 = r55
536 GR_ad_z_3 = r56
537 GR_minus_N = r57
538
539
540
541 .section .text
542 GLOBAL_LIBM_ENTRY(asinhl)
543
544 { .mfi
545 alloc GR_PFS = ar.pfs,0,27,0,0
546 fma.s1 FR_P2 = FR_Arg, FR_Arg, f1 // p2 = x^2 + 1
547 mov GR_Half = 0xfffe // 0.5's exp
548 }
549 { .mfi
550 addl GR_Poly_C_79 = @ltoff(Poly_C_near_0_79), gp // C7, C9 coeffs
551 fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
552 addl GR_Poly_C_35 = @ltoff(Poly_C_near_0_35), gp // C3, C5 coeffs
553 };;
554
555 { .mfi
556 getf.exp GR_ArgExp = FR_Arg // get arument's exponent
557 fabs FR_AX = FR_Arg // absolute value of argument
558 mov GR_TwoN7 = 0xfff8 // 2^-7 exp
559 }
560 { .mfi
561 ld8 GR_Poly_C_79 = [GR_Poly_C_79] // get actual coeff table address
562 fma.s0 FR_Two = f1, f1, f1 // construct 2.0
563 mov GR_ExpMask = 0x1ffff // mask for exp
564 };;
565
566 { .mfi
567 ld8 GR_Poly_C_35 = [GR_Poly_C_35] // get actual coeff table address
568 fclass.m p6,p0 = FR_Arg, 0xe7 // if arg NaN inf zero
569 mov GR_TwoP63 = 0x1003e // 2^63 exp
570 }
571 { .mfi
572 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp
573 nop.f 0
574 nop.i 0
575 };;
576
577 { .mfi
578 setf.exp FR_Half = GR_Half // construct 0.5
579 fclass.m p7,p0 = FR_Arg, 0x09 // if arg + denorm
580 and GR_ArgExp = GR_ExpMask, GR_ArgExp // select exp
581 }
582 { .mfb
583 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
584 nop.f 0
585 nop.b 0
586 };;
587 { .mfi
588 ldfe FR_C9 = [GR_Poly_C_79],16 // load C9
589 fclass.m p10,p0 = FR_Arg, 0x0a // if arg - denorm
590 cmp.gt p8, p0 = GR_TwoN7, GR_ArgExp // if arg < 2^-7 ('near 0')
591 }
592 { .mfb
593 cmp.le p9, p0 = GR_TwoP63, GR_ArgExp // if arg > 2^63 ('huges')
594 (p6) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a
595 (p6) br.ret.spnt b0 // return
596 };;
597 // (X^2 + 1) computation
598 { .mfi
599 (p8) ldfe FR_C5 = [GR_Poly_C_35],16 // load C5
600 fms.s1 FR_Tmp = f1, f1, FR_P2 // Tmp = 1 - p2
601 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
602 }
603 { .mfb
604 (p8) ldfe FR_C7 = [GR_Poly_C_79],16 // load C7
605 (p7) fnma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a - a*a
606 (p7) br.ret.spnt b0 // return
607 };;
608
609 { .mfi
610 (p8) ldfe FR_C3 = [GR_Poly_C_35],16 // load C3
611 fcmp.lt.s1 p11, p12 = FR_Arg, f0 // if arg is negative
612 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
613 }
614 { .mfb
615 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
616 (p10) fma.s0 FR_Res = FR_Arg,FR_Arg,FR_Arg // r = a + a*a
617 (p10) br.ret.spnt b0 // return
618 };;
619
620 { .mfi
621 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
622 frsqrta.s1 FR_Rcp, p0 = FR_P2 // Rcp = 1/p2 reciprocal appr.
623 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
624 }
625 { .mfi
626 nop.m 0
627 fms.s1 FR_P2L = FR_AX, FR_AX, FR_X2 //low part of p2=fma(X*X-p2)
628 mov GR_Bias = 0x0FFFF // Create exponent bias
629 };;
630
631 { .mfb
632 nop.m 0
633 (p9) fms.s1 FR_XLog_Hi = FR_Two, FR_AX, f0 // Hi of log1p arg = 2*X - 1
634 (p9) br.cond.spnt huges_logl // special version of log1p
635 };;
636
637 { .mfb
638 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
639 (p8) fma.s1 FR_X3 = FR_X2, FR_Arg, f0 // x^3 = x^2 * x
640 (p8) br.cond.spnt near_0 // Go to near 0 branch
641 };;
642
643 { .mfi
644 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
645 nop.f 0
646 nop.i 0
647 };;
648
649 { .mfi
650 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
651 fma.s1 FR_Tmp = FR_Tmp, f1, FR_X2 // Tmp = Tmp + x^2
652 mov GR_exp_mask = 0x1FFFF // Create exponent mask
653 };;
654
655 { .mfi
656 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
657 fma.s1 FR_GG = FR_Rcp, FR_P2, f0 // g = Rcp * p2
658 // 8 bit Newton Raphson iteration
659 nop.i 0
660 }
661 { .mfi
662 nop.m 0
663 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
664 nop.i 0
665 };;
666 { .mfi
667 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
668 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
669 nop.i 0
670 }
671 { .mfi
672 nop.m 0
673 fma.s1 FR_P2L = FR_Tmp, f1, FR_P2L // low part of p2 = Tmp + p2l
674 nop.i 0
675 };;
676
677 { .mfi
678 ldfe FR_Q1 = [GR_ad_q] // Load Q1
679 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
680 // 16 bit Newton Raphson iteration
681 nop.i 0
682 }
683 { .mfi
684 nop.m 0
685 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
686 nop.i 0
687 };;
688
689 { .mfi
690 nop.m 0
691 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
692 nop.i 0
693 };;
694
695 { .mfi
696 nop.m 0
697 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
698 // 32 bit Newton Raphson iteration
699 nop.i 0
700 }
701 { .mfi
702 nop.m 0
703 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
704 nop.i 0
705 };;
706
707 { .mfi
708 nop.m 0
709 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
710 nop.i 0
711 };;
712
713 { .mfi
714 nop.m 0
715 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
716 // 64 bit Newton Raphson iteration
717 nop.i 0
718 }
719 { .mfi
720 nop.m 0
721 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
722 nop.i 0
723 };;
724
725 { .mfi
726 nop.m 0
727 fnma.s1 FR_DD = FR_GG, FR_GG, FR_P2 // Remainder d = g * g - p2
728 nop.i 0
729 }
730 { .mfi
731 nop.m 0
732 fma.s1 FR_XLog_Hi = FR_AX, f1, FR_GG // bh = z + gh
733 nop.i 0
734 };;
735
736 { .mfi
737 nop.m 0
738 fma.s1 FR_DD = FR_DD, f1, FR_P2L // add p2l: d = d + p2l
739 nop.i 0
740 };;
741
742
743 { .mfi
744 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
745 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
746 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
747 };;
748
749 { .mfi
750 nop.m 0
751 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
752 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
753 }
754 { .mfi
755 nop.m 0
756 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
757 nop.i 0
758 };;
759
760 { .mmi
761 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
762 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
763 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
764 };;
765
766 { .mmi
767 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
768 nop.m 0
769 nop.i 0
770 };;
771
772 { .mmi
773 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
774 nop.m 0
775 nop.i 0
776 };;
777
778 { .mfi
779 nop.m 0
780 fms.s1 FR_XLog_Lo = FR_GG, f1, FR_XLog_Hi // bl = gh - bh
781 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
782 };;
783
784 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
785 // "DEAD" ZONE!
786
787 { .mfi
788 nop.m 0
789 nop.f 0
790 nop.i 0
791 };;
792
793 { .mfi
794 nop.m 0
795 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
796 nop.i 0
797 };;
798
799 { .mmi
800 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
801 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
802 nop.i 0
803 };;
804
805 { .mfi
806 nop.m 0
807 nop.f 0
808 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
809 };;
810
811
812 { .mfi
813 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
814 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_AX // bl = bl + x
815 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
816 }
817 { .mfi
818 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
819 nop.f 0
820 sub GR_N = GR_N, GR_Bias // sub bias from exp
821 };;
822
823 { .mmi
824 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
825 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
826 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
827 };;
828
829 { .mmi
830 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
831 nop.m 0
832 nop.i 0
833 };;
834
835 { .mmi
836 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
837 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
838 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
839 };;
840
841 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
842 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
843 // So we can negate Q coefficients there for negative values
844
845 { .mfi
846 nop.m 0
847 (p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
848 nop.i 0
849 }
850 { .mfi
851 nop.m 0
852 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
853 nop.i 0
854 };;
855
856 { .mfi
857 nop.m 0
858 (p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
859 nop.i 0
860 };;
861
862 { .mfi
863 nop.m 0
864 (p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
865 nop.i 0
866 };;
867
868 { .mfi
869 nop.m 0
870 (p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
871 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
872 };;
873
874 { .mfi
875 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
876 nop.f 0
877 nop.i 0
878 };;
879
880 { .mfi
881 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
882 nop.f 0
883 nop.i 0
884 };;
885
886 { .mfi
887 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
888 fcvt.xf FR_float_N = FR_float_N
889 nop.i 0
890 };;
891
892 { .mfi
893 nop.m 0
894 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
895 nop.i 0
896 }
897 { .mfi
898 nop.m 0
899 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
900 nop.i 0
901 };;
902
903 { .mfi
904 nop.m 0
905 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
906 nop.i 0
907 }
908 { .mfi
909 nop.m 0
910 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^-N
911 nop.i 0
912 };;
913
914 { .mfi
915 nop.m 0
916 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
917 nop.i 0
918 }
919 { .mfi
920 nop.m 0
921 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
922 nop.i 0
923 };;
924
925 { .mfi
926 nop.m 0
927 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
928 nop.i 0
929 };;
930
931 { .mfi
932 nop.m 0
933 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
934 nop.i 0
935 }
936 { .mfi
937 nop.m 0
938 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
939 nop.i 0
940 };;
941
942 { .mfi
943 nop.m 0
944 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
945 nop.i 0
946 }
947 { .mfi
948 nop.m 0
949 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
950 nop.i 0
951 };;
952
953 { .mfi
954 nop.m 0
955 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
956 nop.i 0
957 }
958 { .mfi
959 nop.m 0
960 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
961 nop.i 0
962 };;
963
964 { .mfi
965 nop.m 0
966 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
967 nop.i 0
968 }
969 { .mfi
970 nop.m 0
971 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
972 nop.i 0
973 };;
974
975 .pred.rel "mutex",p12,p11
976 { .mfi
977 nop.m 0
978 (p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
979 nop.i 0
980 }
981 { .mfi
982 nop.m 0
983 (p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
984 nop.i 0
985 };;
986
987
988 .pred.rel "mutex",p12,p11
989 { .mfi
990 nop.m 0
991 (p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
992 nop.i 0
993 }
994 { .mfi
995 nop.m 0
996 (p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
997 nop.i 0
998 }
999 ;;
1000
1001 { .mfi
1002 nop.m 0
1003 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
1004 // Y_lo=poly_hi+poly_lo
1005 nop.i 0
1006 }
1007 { .mfi
1008 nop.m 0
1009 (p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1010 nop.i 0
1011 };;
1012
1013 { .mfb
1014 nop.m 0
1015 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1016 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1017 };;
1018
1019 // * SPECIAL VERSION OF LOGL FOR HUGE ARGUMENTS *
1020
1021 huges_logl:
1022 { .mfi
1023 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
1024 fmerge.ns FR_Neg_One = f1, f1 // Form -1.0
1025 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
1026 };;
1027
1028 { .mfi
1029 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
1030 nop.f 0
1031 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
1032 }
1033 { .mfi
1034 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
1035 nop.f 0
1036 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
1037 };;
1038
1039 { .mfi
1040 nop.m 0
1041 nop.f 0
1042 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
1043 }
1044 { .mfi
1045 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
1046 nop.f 0
1047 nop.i 0
1048 };;
1049
1050 { .mfi
1051 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
1052 nop.f 0
1053 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
1054 };;
1055
1056 { .mfi
1057 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
1058 nop.f 0
1059 mov GR_exp_mask = 0x1FFFF // Create exponent mask
1060 }
1061 { .mfi
1062 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
1063 nop.f 0
1064 mov GR_Bias = 0x0FFFF // Create exponent bias
1065 };;
1066
1067 { .mfi
1068 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
1069 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
1070 nop.i 0
1071 };;
1072
1073 { .mmi
1074 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
1075 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
1076 nop.i 0
1077 };;
1078
1079 { .mfi
1080 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
1081 nop.f 0
1082 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
1083 };;
1084
1085 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1086 // "DEAD" ZONE!
1087
1088 { .mmi
1089 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
1090 sub GR_N = GR_N, GR_Bias
1091 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
1092 };;
1093
1094 { .mfi
1095 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
1096 nop.f 0
1097 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
1098 };;
1099
1100 { .mmf
1101 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
1102 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
1103 nop.f 0
1104 };;
1105
1106 { .mmi
1107 nop.m 0
1108 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
1109 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
1110 };;
1111
1112 { .mmi
1113 ldfe FR_Q1 = [GR_ad_q] // Load Q1
1114 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
1115 nop.i 0
1116 };;
1117
1118 { .mmi
1119 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
1120 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
1121 nop.i 0
1122 };;
1123
1124 { .mmi
1125 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
1126 nop.m 0
1127 nop.i 0
1128 };;
1129
1130 { .mfi
1131 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
1132 nop.f 0
1133 nop.i 0
1134 }
1135 { .mfi
1136 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
1137 nop.f 0
1138 nop.i 0
1139 };;
1140
1141 { .mfi
1142 nop.m 0
1143 nop.f 0
1144 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
1145 };;
1146
1147 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1148 // "DEAD" ZONE!
1149 // JUST HAVE TO INSERT 3 NOP CYCLES (nothing to do here)
1150
1151 { .mfi
1152 nop.m 0
1153 nop.f 0
1154 nop.i 0
1155 };;
1156
1157 { .mfi
1158 nop.m 0
1159 nop.f 0
1160 nop.i 0
1161 };;
1162
1163 { .mfi
1164 nop.m 0
1165 nop.f 0
1166 nop.i 0
1167 };;
1168
1169 { .mfi
1170 nop.m 0
1171 (p11) fma.s1 FR_Q4 = FR_Q4, FR_Neg_One, f0 // Negate Q4
1172 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
1173 };;
1174
1175 { .mfi
1176 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
1177 fcvt.xf FR_float_N = FR_float_N
1178 nop.i 0
1179 }
1180 { .mfi
1181 nop.m 0
1182 (p11) fma.s1 FR_Q3 = FR_Q3, FR_Neg_One, f0 // Negate Q3
1183 nop.i 0
1184 };;
1185
1186 { .mfi
1187 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
1188 (p11) fma.s1 FR_Q2 = FR_Q2, FR_Neg_One, f0 // Negate Q2
1189 nop.i 0
1190 }
1191 { .mfi
1192 nop.m 0
1193 (p11) fma.s1 FR_Q1 = FR_Q1, FR_Neg_One, f0 // Negate Q1
1194 nop.i 0
1195 };;
1196
1197 { .mfi
1198 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
1199 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1200 nop.i 0
1201 }
1202 { .mfi
1203 nop.m 0
1204 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1205 nop.i 0
1206 };;
1207
1208 { .mmf
1209 nop.m 0
1210 nop.m 0
1211 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1212 };;
1213
1214 { .mfi
1215 nop.m 0
1216 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
1217 nop.i 0
1218 }
1219 { .mfi
1220 nop.m 0
1221 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
1222 nop.i 0
1223 };;
1224
1225 { .mfi
1226 nop.m 0
1227 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1228 nop.i 0
1229 };;
1230
1231 { .mfi
1232 nop.m 0
1233 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1234 nop.i 0
1235 }
1236 { .mfi
1237 nop.m 0
1238 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1239 nop.i 0
1240 };;
1241
1242 { .mfi
1243 nop.m 0
1244 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
1245 nop.i 0
1246 };;
1247
1248 { .mfi
1249 nop.m 0
1250 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1251 nop.i 0
1252 }
1253 { .mfi
1254 nop.m 0
1255 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1256 nop.i 0
1257 };;
1258
1259 { .mfi
1260 nop.m 0
1261 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1262 nop.i 0
1263 }
1264 { .mfi
1265 nop.m 0
1266 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1267 nop.i 0
1268 };;
1269
1270 .pred.rel "mutex",p12,p11
1271 { .mfi
1272 nop.m 0
1273 (p12) fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1274 nop.i 0
1275 }
1276 { .mfi
1277 nop.m 0
1278 (p11) fms.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1279 nop.i 0
1280 };;
1281
1282
1283 .pred.rel "mutex",p12,p11
1284 { .mfi
1285 nop.m 0
1286 (p12) fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1287 nop.i 0
1288 }
1289 { .mfi
1290 nop.m 0
1291 (p11) fms.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1292 nop.i 0
1293 };;
1294
1295 { .mfi
1296 nop.m 0
1297 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
1298 nop.i 0
1299 }
1300 { .mfi
1301 nop.m 0
1302 (p11) fma.s0 FR_Y_hi = FR_Y_hi, FR_Neg_One, f0 // FR_Y_hi sign for neg
1303 nop.i 0
1304 };;
1305
1306 { .mfb
1307 nop.m 0
1308 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1309 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1310 };;
1311
1312 // NEAR ZERO POLYNOMIAL INTERVAL
1313 near_0:
1314 { .mfi
1315 nop.m 0
1316 fma.s1 FR_X4 = FR_X2, FR_X2, f0 // x^4 = x^2 * x^2
1317 nop.i 0
1318 };;
1319
1320 { .mfi
1321 nop.m 0
1322 fma.s1 FR_P9 = FR_C9,FR_X2,FR_C7 // p9 = C9*x^2 + C7
1323 nop.i 0
1324 }
1325 { .mfi
1326 nop.m 0
1327 fma.s1 FR_P5 = FR_C5,FR_X2,FR_C3 // p5 = C5*x^2 + C3
1328 nop.i 0
1329 };;
1330
1331 { .mfi
1332 nop.m 0
1333 fma.s1 FR_P3 = FR_P9,FR_X4,FR_P5 // p3 = p9*x^4 + p5
1334 nop.i 0
1335 };;
1336
1337 { .mfb
1338 nop.m 0
1339 fma.s0 FR_Res = FR_P3,FR_X3,FR_Arg // res = p3*C3 + x
1340 br.ret.sptk b0 // Near 0 path return
1341 };;
1342
1343 GLOBAL_LIBM_END(asinhl)
1344 libm_alias_ldouble_other (asinh, asinh)