initial commit
[glibc.git] / sysdeps / ia64 / fpu / e_acoshl.S
1 .file "acoshl.s"
2
3
4 // Copyright (c) 2000 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 //
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
18 //
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
22
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
28 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 // Intel Corporation is the author of this code, and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
38 //
39 //*********************************************************************
40 //
41 // History:
42 // 10/01/01 Initial version
43 // 10/10/01 Performance inproved
44 // 12/11/01 Changed huges_logp to not be global
45 // 01/02/02 Corrected .restore syntax
46 // 05/20/02 Cleaned up namespace and sf0 syntax
47 // 08/14/02 Changed mli templates to mlx
48 // 02/06/03 Reorganized data tables
49 // 03/31/05 Reformatted delimiters between data tables
50 //
51 //*********************************************************************
52 //
53 // API
54 //==============================================================
55 // long double acoshl(long double);
56 //
57 // Overview of operation
58 //==============================================================
59 //
60 // There are 6 paths:
61 // 1. x = 1
62 // Return acoshl(x) = 0;
63 //
64 // 2. x < 1
65 // Return acoshl(x) = Nan (Domain error, error handler call with tag 135);
66 //
67 // 3. x = [S,Q]Nan or +INF
68 // Return acoshl(x) = x + x;
69 //
70 // 4. 'Near 1': 1 < x < 1+1/8
71 // Return acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
72 // where y = 1, P(y)/Q(y) - rational approximation
73 //
74 // 5. 'Huges': x > 0.5*2^64
75 // Return acoshl(x) = (logl(2*x-1));
76 //
77 // 6. 'Main path': 1+1/8 < x < 0.5*2^64
78 // b_hi + b_lo = x + sqrt(x^2 - 1);
79 // acoshl(x) = logl_special(b_hi, b_lo);
80 //
81 // Algorithm description
82 //==============================================================
83 //
84 // I. Near 1 path algorithm
85 // **************************************************************
86 // The formula is acoshl(x) = sqrtl(2*y)*(1-P(y)/Q(y)),
87 // where y = 1, P(y)/Q(y) - rational approximation
88 //
89 // 1) y = x - 1, y2 = 2 * y
90 //
91 // 2) Compute in parallel sqrtl(2*y) and P(y)/Q(y)
92 // a) sqrtl computation method described below (main path algorithm, item 2))
93 // As result we obtain (gg+gl) - multiprecision result
94 // as pair of double extended values
95 // b) P(y) and Q(y) calculated without any extra precision manipulations
96 // c) P/Q division:
97 // y = frcpa(Q) initial approximation of 1/Q
98 // z = P*y initial approximation of P/Q
99 //
100 // e = 1 - b*y
101 // e2 = e + e^2
102 // e1 = e^2
103 // y1 = y + y*e2 = y + y*(e+e^2)
104 //
105 // e3 = e + e1^2
106 // y2 = y + y1*e3 = y + y*(e+e^2+..+e^6)
107 //
108 // r = P - Q*z
109 // e = 1 - Q*y2
110 // xx = z + r*y2 high part of a/b
111 //
112 // y3 = y2 + y2*e4
113 // r1 = P - Q*xx
114 // xl = r1*y3 low part of a/b
115 //
116 // 3) res = sqrt(2*y) - sqrt(2*y)*(P(y)/Q(y)) =
117 // = (gg+gl) - (gg + gl)*(xx+xl);
118 //
119 // a) hh = gg*xx; hl = gg*xl; lh = gl*xx; ll = gl*xl;
120 // b) res = ((((gl + ll) + lh) + hl) + hh) + gg;
121 // (exactly in this order)
122 //
123 // II. Main path algorithm
124 // ( thanks to Peter Markstein for the idea of sqrt(x^2+1) computation! )
125 // **********************************************************************
126 //
127 // There are 3 parts of x+sqrt(x^2-1) computation:
128 //
129 // 1) m2 = (m2_hi+m2_lo) = x^2-1 obtaining
130 // ------------------------------------
131 // m2_hi = x2_hi - 1, where x2_hi = x * x;
132 // m2_lo = x2_lo + p1_lo, where
133 // x2_lo = FMS(x*x-x2_hi),
134 // p1_lo = (1 + m2_hi) - x2_hi;
135 //
136 // 2) g = (g_hi+g_lo) = sqrt(m2) = sqrt(m2_hi+m2_lo)
137 // ----------------------------------------------
138 // r = invsqrt(m2_hi) (8-bit reciprocal square root approximation);
139 // g = m2_hi * r (first 8 bit-approximation of sqrt);
140 //
141 // h = 0.5 * r;
142 // e = 0.5 - g * h;
143 // g = g * e + g (second 16 bit-approximation of sqrt);
144 //
145 // h = h * e + h;
146 // e = 0.5 - g * h;
147 // g = g * e + g (third 32 bit-approximation of sqrt);
148 //
149 // h = h * e + h;
150 // e = 0.5 - g * h;
151 // g_hi = g * e + g (fourth 64 bit-approximation of sqrt);
152 //
153 // Remainder computation:
154 // h = h * e + h;
155 // d = (m2_hi - g_hi * g_hi) + m2_lo;
156 // g_lo = d * h;
157 //
158 // 3) b = (b_hi + b_lo) = x + g, where g = (g_hi + g_lo) = sqrt(x^2-1)
159 // -------------------------------------------------------------------
160 // b_hi = (g_hi + x) + gl;
161 // b_lo = (x - b_hi) + g_hi + gl;
162 //
163 // Now we pass b presented as sum b_hi + b_lo to special version
164 // of logl function which accept a pair of arguments as
165 // mutiprecision value.
166 //
167 // Special log algorithm overview
168 // ================================
169 // Here we use a table lookup method. The basic idea is that in
170 // order to compute logl(Arg) for an argument Arg in [1,2),
171 // we construct a value G such that G*Arg is close to 1 and that
172 // logl(1/G) is obtainable easily from a table of values calculated
173 // beforehand. Thus
174 //
175 // logl(Arg) = logl(1/G) + logl((G*Arg - 1))
176 //
177 // Because |G*Arg - 1| is small, the second term on the right hand
178 // side can be approximated by a short polynomial. We elaborate
179 // this method in four steps.
180 //
181 // Step 0: Initialization
182 //
183 // We need to calculate logl( X+1 ). Obtain N, S_hi such that
184 //
185 // X = 2^N * ( S_hi + S_lo ) exactly
186 //
187 // where S_hi in [1,2) and S_lo is a correction to S_hi in the sense
188 // that |S_lo| <= ulp(S_hi).
189 //
190 // For the special version of logl: S_lo = b_lo
191 // !-----------------------------------------------!
192 //
193 // Step 1: Argument Reduction
194 //
195 // Based on S_hi, obtain G_1, G_2, G_3 from a table and calculate
196 //
197 // G := G_1 * G_2 * G_3
198 // r := (G * S_hi - 1) + G * S_lo
199 //
200 // These G_j's have the property that the product is exactly
201 // representable and that |r| < 2^(-12) as a result.
202 //
203 // Step 2: Approximation
204 //
205 // logl(1 + r) is approximated by a short polynomial poly(r).
206 //
207 // Step 3: Reconstruction
208 //
209 // Finally, logl( X ) = logl( X+1 ) is given by
210 //
211 // logl( X ) = logl( 2^N * (S_hi + S_lo) )
212 // ~=~ N*logl(2) + logl(1/G) + logl(1 + r)
213 // ~=~ N*logl(2) + logl(1/G) + poly(r).
214 //
215 // For detailed description see logl or log1pl function, regular path.
216 //
217 // Registers used
218 //==============================================================
219 // Floating Point registers used:
220 // f8, input
221 // f32 -> f95 (64 registers)
222
223 // General registers used:
224 // r32 -> r67 (36 registers)
225
226 // Predicate registers used:
227 // p7 -> p11
228 // p7 for 'NaNs, Inf' path
229 // p8 for 'near 1' path
230 // p9 for 'huges' path
231 // p10 for x = 1
232 // p11 for x < 1
233 //
234 //*********************************************************************
235 // IEEE Special Conditions:
236 //
237 // acoshl(+inf) = +inf
238 // acoshl(-inf) = QNaN
239 // acoshl(1) = 0
240 // acoshl(x<1) = QNaN
241 // acoshl(SNaN) = QNaN
242 // acoshl(QNaN) = QNaN
243 //
244
245 // Data tables
246 //==============================================================
247
248 RODATA
249 .align 64
250
251 // Near 1 path rational approximation coefficients
252 LOCAL_OBJECT_START(Poly_P)
253 data8 0xB0978143F695D40F, 0x3FF1 // .84205539791447100108478906277453574946e-4
254 data8 0xB9800D841A8CAD29, 0x3FF6 // .28305085180397409672905983082168721069e-2
255 data8 0xC889F455758C1725, 0x3FF9 // .24479844297887530847660233111267222945e-1
256 data8 0x9BE1DFF006F45F12, 0x3FFB // .76114415657565879842941751209926938306e-1
257 data8 0x9E34AF4D372861E0, 0x3FFB // .77248925727776366270605984806795850504e-1
258 data8 0xF3DC502AEE14C4AE, 0x3FA6 // .3077953476682583606615438814166025592e-26
259 LOCAL_OBJECT_END(Poly_P)
260
261 //
262 LOCAL_OBJECT_START(Poly_Q)
263 data8 0xF76E3FD3C7680357, 0x3FF1 // .11798413344703621030038719253730708525e-3
264 data8 0xD107D2E7273263AE, 0x3FF7 // .63791065024872525660782716786703188820e-2
265 data8 0xB609BE5CDE206AEF, 0x3FFB // .88885771950814004376363335821980079985e-1
266 data8 0xF7DEACAC28067C8A, 0x3FFD // .48412074662702495416825113623936037072302
267 data8 0x8F9BE5890CEC7E38, 0x3FFF // 1.1219450873557867470217771071068369729526
268 data8 0xED4F06F3D2BC92D1, 0x3FFE // .92698710873331639524734537734804056798748
269 LOCAL_OBJECT_END(Poly_Q)
270
271 // Q coeffs
272 LOCAL_OBJECT_START(Constants_Q)
273 data4 0x00000000,0xB1721800,0x00003FFE,0x00000000
274 data4 0x4361C4C6,0x82E30865,0x0000BFE2,0x00000000
275 data4 0x328833CB,0xCCCCCAF2,0x00003FFC,0x00000000
276 data4 0xA9D4BAFB,0x80000077,0x0000BFFD,0x00000000
277 data4 0xAAABE3D2,0xAAAAAAAA,0x00003FFD,0x00000000
278 data4 0xFFFFDAB7,0xFFFFFFFF,0x0000BFFD,0x00000000
279 LOCAL_OBJECT_END(Constants_Q)
280
281 // Z1 - 16 bit fixed
282 LOCAL_OBJECT_START(Constants_Z_1)
283 data4 0x00008000
284 data4 0x00007879
285 data4 0x000071C8
286 data4 0x00006BCB
287 data4 0x00006667
288 data4 0x00006187
289 data4 0x00005D18
290 data4 0x0000590C
291 data4 0x00005556
292 data4 0x000051EC
293 data4 0x00004EC5
294 data4 0x00004BDB
295 data4 0x00004925
296 data4 0x0000469F
297 data4 0x00004445
298 data4 0x00004211
299 LOCAL_OBJECT_END(Constants_Z_1)
300
301 // G1 and H1 - IEEE single and h1 - IEEE double
302 LOCAL_OBJECT_START(Constants_G_H_h1)
303 data4 0x3F800000,0x00000000
304 data8 0x0000000000000000
305 data4 0x3F70F0F0,0x3D785196
306 data8 0x3DA163A6617D741C
307 data4 0x3F638E38,0x3DF13843
308 data8 0x3E2C55E6CBD3D5BB
309 data4 0x3F579430,0x3E2FF9A0
310 data8 0xBE3EB0BFD86EA5E7
311 data4 0x3F4CCCC8,0x3E647FD6
312 data8 0x3E2E6A8C86B12760
313 data4 0x3F430C30,0x3E8B3AE7
314 data8 0x3E47574C5C0739BA
315 data4 0x3F3A2E88,0x3EA30C68
316 data8 0x3E20E30F13E8AF2F
317 data4 0x3F321640,0x3EB9CEC8
318 data8 0xBE42885BF2C630BD
319 data4 0x3F2AAAA8,0x3ECF9927
320 data8 0x3E497F3497E577C6
321 data4 0x3F23D708,0x3EE47FC5
322 data8 0x3E3E6A6EA6B0A5AB
323 data4 0x3F1D89D8,0x3EF8947D
324 data8 0xBDF43E3CD328D9BE
325 data4 0x3F17B420,0x3F05F3A1
326 data8 0x3E4094C30ADB090A
327 data4 0x3F124920,0x3F0F4303
328 data8 0xBE28FBB2FC1FE510
329 data4 0x3F0D3DC8,0x3F183EBF
330 data8 0x3E3A789510FDE3FA
331 data4 0x3F088888,0x3F20EC80
332 data8 0x3E508CE57CC8C98F
333 data4 0x3F042108,0x3F29516A
334 data8 0xBE534874A223106C
335 LOCAL_OBJECT_END(Constants_G_H_h1)
336
337 // Z2 - 16 bit fixed
338 LOCAL_OBJECT_START(Constants_Z_2)
339 data4 0x00008000
340 data4 0x00007F81
341 data4 0x00007F02
342 data4 0x00007E85
343 data4 0x00007E08
344 data4 0x00007D8D
345 data4 0x00007D12
346 data4 0x00007C98
347 data4 0x00007C20
348 data4 0x00007BA8
349 data4 0x00007B31
350 data4 0x00007ABB
351 data4 0x00007A45
352 data4 0x000079D1
353 data4 0x0000795D
354 data4 0x000078EB
355 LOCAL_OBJECT_END(Constants_Z_2)
356
357 // G2 and H2 - IEEE single and h2 - IEEE double
358 LOCAL_OBJECT_START(Constants_G_H_h2)
359 data4 0x3F800000,0x00000000
360 data8 0x0000000000000000
361 data4 0x3F7F00F8,0x3B7F875D
362 data8 0x3DB5A11622C42273
363 data4 0x3F7E03F8,0x3BFF015B
364 data8 0x3DE620CF21F86ED3
365 data4 0x3F7D08E0,0x3C3EE393
366 data8 0xBDAFA07E484F34ED
367 data4 0x3F7C0FC0,0x3C7E0586
368 data8 0xBDFE07F03860BCF6
369 data4 0x3F7B1880,0x3C9E75D2
370 data8 0x3DEA370FA78093D6
371 data4 0x3F7A2328,0x3CBDC97A
372 data8 0x3DFF579172A753D0
373 data4 0x3F792FB0,0x3CDCFE47
374 data8 0x3DFEBE6CA7EF896B
375 data4 0x3F783E08,0x3CFC15D0
376 data8 0x3E0CF156409ECB43
377 data4 0x3F774E38,0x3D0D874D
378 data8 0xBE0B6F97FFEF71DF
379 data4 0x3F766038,0x3D1CF49B
380 data8 0xBE0804835D59EEE8
381 data4 0x3F757400,0x3D2C531D
382 data8 0x3E1F91E9A9192A74
383 data4 0x3F748988,0x3D3BA322
384 data8 0xBE139A06BF72A8CD
385 data4 0x3F73A0D0,0x3D4AE46F
386 data8 0x3E1D9202F8FBA6CF
387 data4 0x3F72B9D0,0x3D5A1756
388 data8 0xBE1DCCC4BA796223
389 data4 0x3F71D488,0x3D693B9D
390 data8 0xBE049391B6B7C239
391 LOCAL_OBJECT_END(Constants_G_H_h2)
392
393 // G3 and H3 - IEEE single and h3 - IEEE double
394 LOCAL_OBJECT_START(Constants_G_H_h3)
395 data4 0x3F7FFC00,0x38800100
396 data8 0x3D355595562224CD
397 data4 0x3F7FF400,0x39400480
398 data8 0x3D8200A206136FF6
399 data4 0x3F7FEC00,0x39A00640
400 data8 0x3DA4D68DE8DE9AF0
401 data4 0x3F7FE400,0x39E00C41
402 data8 0xBD8B4291B10238DC
403 data4 0x3F7FDC00,0x3A100A21
404 data8 0xBD89CCB83B1952CA
405 data4 0x3F7FD400,0x3A300F22
406 data8 0xBDB107071DC46826
407 data4 0x3F7FCC08,0x3A4FF51C
408 data8 0x3DB6FCB9F43307DB
409 data4 0x3F7FC408,0x3A6FFC1D
410 data8 0xBD9B7C4762DC7872
411 data4 0x3F7FBC10,0x3A87F20B
412 data8 0xBDC3725E3F89154A
413 data4 0x3F7FB410,0x3A97F68B
414 data8 0xBD93519D62B9D392
415 data4 0x3F7FAC18,0x3AA7EB86
416 data8 0x3DC184410F21BD9D
417 data4 0x3F7FA420,0x3AB7E101
418 data8 0xBDA64B952245E0A6
419 data4 0x3F7F9C20,0x3AC7E701
420 data8 0x3DB4B0ECAABB34B8
421 data4 0x3F7F9428,0x3AD7DD7B
422 data8 0x3D9923376DC40A7E
423 data4 0x3F7F8C30,0x3AE7D474
424 data8 0x3DC6E17B4F2083D3
425 data4 0x3F7F8438,0x3AF7CBED
426 data8 0x3DAE314B811D4394
427 data4 0x3F7F7C40,0x3B03E1F3
428 data8 0xBDD46F21B08F2DB1
429 data4 0x3F7F7448,0x3B0BDE2F
430 data8 0xBDDC30A46D34522B
431 data4 0x3F7F6C50,0x3B13DAAA
432 data8 0x3DCB0070B1F473DB
433 data4 0x3F7F6458,0x3B1BD766
434 data8 0xBDD65DDC6AD282FD
435 data4 0x3F7F5C68,0x3B23CC5C
436 data8 0xBDCDAB83F153761A
437 data4 0x3F7F5470,0x3B2BC997
438 data8 0xBDDADA40341D0F8F
439 data4 0x3F7F4C78,0x3B33C711
440 data8 0x3DCD1BD7EBC394E8
441 data4 0x3F7F4488,0x3B3BBCC6
442 data8 0xBDC3532B52E3E695
443 data4 0x3F7F3C90,0x3B43BAC0
444 data8 0xBDA3961EE846B3DE
445 data4 0x3F7F34A0,0x3B4BB0F4
446 data8 0xBDDADF06785778D4
447 data4 0x3F7F2CA8,0x3B53AF6D
448 data8 0x3DCC3ED1E55CE212
449 data4 0x3F7F24B8,0x3B5BA620
450 data8 0xBDBA31039E382C15
451 data4 0x3F7F1CC8,0x3B639D12
452 data8 0x3D635A0B5C5AF197
453 data4 0x3F7F14D8,0x3B6B9444
454 data8 0xBDDCCB1971D34EFC
455 data4 0x3F7F0CE0,0x3B7393BC
456 data8 0x3DC7450252CD7ADA
457 data4 0x3F7F04F0,0x3B7B8B6D
458 data8 0xBDB68F177D7F2A42
459 LOCAL_OBJECT_END(Constants_G_H_h3)
460
461 // Assembly macros
462 //==============================================================
463
464 // Floating Point Registers
465
466 FR_Arg = f8
467 FR_Res = f8
468
469
470 FR_PP0 = f32
471 FR_PP1 = f33
472 FR_PP2 = f34
473 FR_PP3 = f35
474 FR_PP4 = f36
475 FR_PP5 = f37
476 FR_QQ0 = f38
477 FR_QQ1 = f39
478 FR_QQ2 = f40
479 FR_QQ3 = f41
480 FR_QQ4 = f42
481 FR_QQ5 = f43
482
483 FR_Q1 = f44
484 FR_Q2 = f45
485 FR_Q3 = f46
486 FR_Q4 = f47
487
488 FR_Half = f48
489 FR_Two = f49
490
491 FR_log2_hi = f50
492 FR_log2_lo = f51
493
494
495 FR_X2 = f52
496 FR_M2 = f53
497 FR_M2L = f54
498 FR_Rcp = f55
499 FR_GG = f56
500 FR_HH = f57
501 FR_EE = f58
502 FR_DD = f59
503 FR_GL = f60
504 FR_Tmp = f61
505
506
507 FR_XM1 = f62
508 FR_2XM1 = f63
509 FR_XM12 = f64
510
511
512
513 // Special logl registers
514 FR_XLog_Hi = f65
515 FR_XLog_Lo = f66
516
517 FR_Y_hi = f67
518 FR_Y_lo = f68
519
520 FR_S_hi = f69
521 FR_S_lo = f70
522
523 FR_poly_lo = f71
524 FR_poly_hi = f72
525
526 FR_G = f73
527 FR_H = f74
528 FR_h = f75
529
530 FR_G2 = f76
531 FR_H2 = f77
532 FR_h2 = f78
533
534 FR_r = f79
535 FR_rsq = f80
536 FR_rcub = f81
537
538 FR_float_N = f82
539
540 FR_G3 = f83
541 FR_H3 = f84
542 FR_h3 = f85
543
544 FR_2_to_minus_N = f86
545
546
547 // Near 1 registers
548 FR_PP = f65
549 FR_QQ = f66
550
551
552 FR_PV6 = f69
553 FR_PV4 = f70
554 FR_PV3 = f71
555 FR_PV2 = f72
556
557 FR_QV6 = f73
558 FR_QV4 = f74
559 FR_QV3 = f75
560 FR_QV2 = f76
561
562 FR_Y0 = f77
563 FR_Q0 = f78
564 FR_E0 = f79
565 FR_E2 = f80
566 FR_E1 = f81
567 FR_Y1 = f82
568 FR_E3 = f83
569 FR_Y2 = f84
570 FR_R0 = f85
571 FR_E4 = f86
572 FR_Y3 = f87
573 FR_R1 = f88
574 FR_X_Hi = f89
575 FR_X_lo = f90
576
577 FR_HH = f91
578 FR_LL = f92
579 FR_HL = f93
580 FR_LH = f94
581
582
583
584 // Error handler registers
585 FR_Arg_X = f95
586 FR_Arg_Y = f0
587
588
589 // General Purpose Registers
590
591 // General prolog registers
592 GR_PFS = r32
593 GR_OneP125 = r33
594 GR_TwoP63 = r34
595 GR_Arg = r35
596 GR_Half = r36
597
598 // Near 1 path registers
599 GR_Poly_P = r37
600 GR_Poly_Q = r38
601
602 // Special logl registers
603 GR_Index1 = r39
604 GR_Index2 = r40
605 GR_signif = r41
606 GR_X_0 = r42
607 GR_X_1 = r43
608 GR_X_2 = r44
609 GR_minus_N = r45
610 GR_Z_1 = r46
611 GR_Z_2 = r47
612 GR_N = r48
613 GR_Bias = r49
614 GR_M = r50
615 GR_Index3 = r51
616 GR_exp_2tom80 = r52
617 GR_exp_mask = r53
618 GR_exp_2tom7 = r54
619 GR_ad_ln10 = r55
620 GR_ad_tbl_1 = r56
621 GR_ad_tbl_2 = r57
622 GR_ad_tbl_3 = r58
623 GR_ad_q = r59
624 GR_ad_z_1 = r60
625 GR_ad_z_2 = r61
626 GR_ad_z_3 = r62
627
628 //
629 // Added for unwind support
630 //
631 GR_SAVE_PFS = r32
632 GR_SAVE_B0 = r33
633 GR_SAVE_GP = r34
634
635 GR_Parameter_X = r64
636 GR_Parameter_Y = r65
637 GR_Parameter_RESULT = r66
638 GR_Parameter_TAG = r67
639
640
641
642 .section .text
643 GLOBAL_LIBM_ENTRY(acoshl)
644
645 { .mfi
646 alloc GR_PFS = ar.pfs,0,32,4,0 // Local frame allocation
647 fcmp.lt.s1 p11, p0 = FR_Arg, f1 // if arg is less than 1
648 mov GR_Half = 0xfffe // 0.5's exp
649 }
650 { .mfi
651 addl GR_Poly_Q = @ltoff(Poly_Q), gp // Address of Q-coeff table
652 fma.s1 FR_X2 = FR_Arg, FR_Arg, f0 // Obtain x^2
653 addl GR_Poly_P = @ltoff(Poly_P), gp // Address of P-coeff table
654 };;
655
656 { .mfi
657 getf.d GR_Arg = FR_Arg // get argument as double (int64)
658 fma.s0 FR_Two = f1, f1, f1 // construct 2.0
659 addl GR_ad_z_1 = @ltoff(Constants_Z_1#),gp // logl tables
660 }
661 { .mlx
662 nop.m 0
663 movl GR_TwoP63 = 0x43E8000000000000 // 0.5*2^63 (huge arguments)
664 };;
665
666 { .mfi
667 ld8 GR_Poly_P = [GR_Poly_P] // get actual P-coeff table address
668 fcmp.eq.s1 p10, p0 = FR_Arg, f1 // if arg == 1 (return 0)
669 nop.i 0
670 }
671 { .mlx
672 ld8 GR_Poly_Q = [GR_Poly_Q] // get actual Q-coeff table address
673 movl GR_OneP125 = 0x3FF2000000000000 // 1.125 (near 1 path bound)
674 };;
675
676 { .mfi
677 ld8 GR_ad_z_1 = [GR_ad_z_1] // Get pointer to Constants_Z_1
678 fclass.m p7,p0 = FR_Arg, 0xe3 // if arg NaN inf
679 cmp.le p9, p0 = GR_TwoP63, GR_Arg // if arg > 0.5*2^63 ('huges')
680 }
681 { .mfb
682 cmp.ge p8, p0 = GR_OneP125, GR_Arg // if arg<1.125 -near 1 path
683 fms.s1 FR_XM1 = FR_Arg, f1, f1 // X0 = X-1 (for near 1 path)
684 (p11) br.cond.spnt acoshl_lt_pone // error branch (less than 1)
685 };;
686
687 { .mmi
688 setf.exp FR_Half = GR_Half // construct 0.5
689 (p9) setf.s FR_XLog_Lo = r0 // Low of logl arg=0 (Huges path)
690 mov GR_exp_mask = 0x1FFFF // Create exponent mask
691 };;
692
693 { .mmf
694 (p8) ldfe FR_PP5 = [GR_Poly_P],16 // Load P5
695 (p8) ldfe FR_QQ5 = [GR_Poly_Q],16 // Load Q5
696 fms.s1 FR_M2 = FR_X2, f1, f1 // m2 = x^2 - 1
697 };;
698
699 { .mfi
700 (p8) ldfe FR_QQ4 = [GR_Poly_Q],16 // Load Q4
701 fms.s1 FR_M2L = FR_Arg, FR_Arg, FR_X2 // low part of
702 // m2 = fma(X*X - m2)
703 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
704 }
705 { .mfb
706 (p8) ldfe FR_PP4 = [GR_Poly_P],16 // Load P4
707 (p7) fma.s0 FR_Res = FR_Arg,f1,FR_Arg // r = a + a (Nan, Inf)
708 (p7) br.ret.spnt b0 // return (Nan, Inf)
709 };;
710
711 { .mfi
712 (p8) ldfe FR_PP3 = [GR_Poly_P],16 // Load P3
713 nop.f 0
714 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
715 }
716 { .mfb
717 (p8) ldfe FR_QQ3 = [GR_Poly_Q],16 // Load Q3
718 (p9) fms.s1 FR_XLog_Hi = FR_Two, FR_Arg, f1 // Hi of log arg = 2*X-1
719 (p9) br.cond.spnt huges_logl // special version of log
720 }
721 ;;
722
723 { .mfi
724 (p8) ldfe FR_PP2 = [GR_Poly_P],16 // Load P2
725 (p8) fma.s1 FR_2XM1 = FR_Two, FR_XM1, f0 // 2X0 = 2 * X0
726 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
727 }
728 { .mfb
729 (p8) ldfe FR_QQ2 = [GR_Poly_Q],16 // Load Q2
730 (p10) fma.s0 FR_Res = f0,f1,f0 // r = 0 (arg = 1)
731 (p10) br.ret.spnt b0 // return (arg = 1)
732 };;
733
734 { .mmi
735 (p8) ldfe FR_PP1 = [GR_Poly_P],16 // Load P1
736 (p8) ldfe FR_QQ1 = [GR_Poly_Q],16 // Load Q1
737 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
738 }
739 ;;
740
741 { .mfi
742 (p8) ldfe FR_PP0 = [GR_Poly_P] // Load P0
743 fma.s1 FR_Tmp = f1, f1, FR_M2 // Tmp = 1 + m2
744 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
745 }
746 { .mfb
747 (p8) ldfe FR_QQ0 = [GR_Poly_Q]
748 nop.f 0
749 (p8) br.cond.spnt near_1 // near 1 path
750 };;
751 { .mfi
752 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
753 nop.f 0
754 mov GR_Bias = 0x0FFFF // Create exponent bias
755 };;
756 { .mfi
757 nop.m 0
758 frsqrta.s1 FR_Rcp, p0 = FR_M2 // Rcp = 1/m2 reciprocal appr.
759 nop.i 0
760 };;
761
762 { .mfi
763 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
764 fms.s1 FR_Tmp = FR_X2, f1, FR_Tmp // Tmp = x^2 - Tmp
765 nop.i 0
766 };;
767
768 { .mfi
769 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
770 fma.s1 FR_GG = FR_Rcp, FR_M2, f0 // g = Rcp * m2
771 // 8 bit Newton Raphson iteration
772 nop.i 0
773 }
774 { .mfi
775 nop.m 0
776 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp
777 nop.i 0
778 };;
779 { .mfi
780 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
781 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
782 nop.i 0
783 }
784 { .mfi
785 nop.m 0
786 fma.s1 FR_M2L = FR_Tmp, f1, FR_M2L // low part of m2 = Tmp+m2l
787 nop.i 0
788 };;
789
790 { .mfi
791 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
792 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
793 // 16 bit Newton Raphson iteration
794 nop.i 0
795 }
796 { .mfi
797 nop.m 0
798 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
799 nop.i 0
800 };;
801
802 { .mfi
803 ldfe FR_Q1 = [GR_ad_q] // Load Q1
804 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
805 nop.i 0
806 };;
807 { .mfi
808 nop.m 0
809 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
810 // 32 bit Newton Raphson iteration
811 nop.i 0
812 }
813 { .mfi
814 nop.m 0
815 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
816 nop.i 0
817 };;
818
819 { .mfi
820 nop.m 0
821 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h
822 nop.i 0
823 };;
824
825 { .mfi
826 nop.m 0
827 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g
828 // 64 bit Newton Raphson iteration
829 nop.i 0
830 }
831 { .mfi
832 nop.m 0
833 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h
834 nop.i 0
835 };;
836
837 { .mfi
838 nop.m 0
839 fnma.s1 FR_DD = FR_GG, FR_GG, FR_M2 // Remainder d = g * g - p2
840 nop.i 0
841 }
842 { .mfi
843 nop.m 0
844 fma.s1 FR_XLog_Hi = FR_Arg, f1, FR_GG // bh = z + gh
845 nop.i 0
846 };;
847
848 { .mfi
849 nop.m 0
850 fma.s1 FR_DD = FR_DD, f1, FR_M2L // add p2l: d = d + p2l
851 nop.i 0
852 };;
853
854 { .mfi
855 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
856 nop.f 0
857 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
858 };;
859
860 { .mfi
861 nop.m 0
862 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h
863 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
864 }
865 { .mfi
866 nop.m 0
867 fma.s1 FR_XLog_Hi = FR_DD, FR_HH, FR_XLog_Hi // bh = bh + gl
868 nop.i 0
869 };;
870
871
872
873 { .mmi
874 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
875 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
876 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
877 };;
878
879 { .mmi
880 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
881 nop.m 0
882 nop.i 0
883 };;
884
885 { .mmi
886 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
887 nop.m 0
888 nop.i 0
889 };;
890
891 { .mfi
892 nop.m 0
893 fms.s1 FR_XLog_Lo = FR_Arg, f1, FR_XLog_Hi // bl = x - bh
894 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
895 };;
896
897 // WE CANNOT USE GR_X_1 IN NEXT 3 CYCLES BECAUSE OF POSSIBLE 10 CLOCKS STALL!
898 // "DEAD" ZONE!
899
900 { .mfi
901 nop.m 0
902 nop.f 0
903 nop.i 0
904 };;
905
906 { .mfi
907 nop.m 0
908 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x+1|
909 nop.i 0
910 };;
911
912
913 { .mmi
914 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
915 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
916 nop.i 0
917 };;
918
919 { .mfi
920 nop.m 0
921 nop.f 0
922 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
923 };;
924
925 { .mfi
926 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
927 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GG // bl = bl + gg
928 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
929 }
930 { .mfi
931 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
932 nop.f 0
933 sub GR_N = GR_N, GR_Bias // sub bias from exp
934 };;
935
936 { .mmi
937 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
938 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
939 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
940 };;
941
942 { .mmi
943 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
944 nop.m 0
945 nop.i 0
946 };;
947
948 { .mmi
949 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
950 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
951 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1 * Z_2
952 };;
953
954 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
955 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
956 // (Just nops added - nothing to do here)
957
958 { .mfi
959 nop.m 0
960 fma.s1 FR_XLog_Lo = FR_XLog_Lo, f1, FR_GL // bl = bl + gl
961 nop.i 0
962 };;
963 { .mfi
964 nop.m 0
965 nop.f 0
966 nop.i 0
967 };;
968 { .mfi
969 nop.m 0
970 nop.f 0
971 nop.i 0
972 };;
973
974 { .mfi
975 nop.m 0
976 nop.f 0
977 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
978 };;
979
980 { .mfi
981 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
982 nop.f 0
983 nop.i 0
984 };;
985
986 { .mfi
987 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
988 nop.f 0
989 nop.i 0
990 };;
991
992 { .mfi
993 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
994 fcvt.xf FR_float_N = FR_float_N
995 nop.i 0
996 };;
997
998 { .mfi
999 nop.m 0
1000 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1001 nop.i 0
1002 }
1003 { .mfi
1004 nop.m 0
1005 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1006 nop.i 0
1007 };;
1008
1009 { .mfi
1010 nop.m 0
1011 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1012 nop.i 0
1013 }
1014 { .mfi
1015 nop.m 0
1016 fma.s1 FR_S_lo = FR_XLog_Lo, FR_2_to_minus_N, f0 //S_lo=S_lo*2^(-N)
1017 nop.i 0
1018 };;
1019
1020 { .mfi
1021 nop.m 0
1022 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2) * G_3
1023 nop.i 0
1024 }
1025 { .mfi
1026 nop.m 0
1027 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2) + H_3
1028 nop.i 0
1029 };;
1030
1031 { .mfi
1032 nop.m 0
1033 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1034 nop.i 0
1035 };;
1036
1037 { .mfi
1038 nop.m 0
1039 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1040 nop.i 0
1041 }
1042 { .mfi
1043 nop.m 0
1044 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1045 nop.i 0
1046 };;
1047
1048 { .mfi
1049 nop.m 0
1050 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h=N*log2_lo+h
1051 nop.i 0
1052 }
1053 { .mfi
1054 nop.m 0
1055 fma.s1 FR_r = FR_G, FR_S_lo, FR_r // r=G*S_lo+(G*S_hi-1)
1056 nop.i 0
1057 };;
1058
1059 { .mfi
1060 nop.m 0
1061 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1062 nop.i 0
1063 }
1064 { .mfi
1065 nop.m 0
1066 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1067 nop.i 0
1068 };;
1069
1070 { .mfi
1071 nop.m 0
1072 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1073 nop.i 0
1074 }
1075 { .mfi
1076 nop.m 0
1077 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1078 nop.i 0
1079 };;
1080
1081 { .mfi
1082 nop.m 0
1083 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1084 nop.i 0
1085 };;
1086
1087 { .mfi
1088 nop.m 0
1089 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1090 nop.i 0
1091 };;
1092
1093 { .mfi
1094 nop.m 0
1095 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo
1096 // Y_lo=poly_hi+poly_lo
1097 nop.i 0
1098 };;
1099
1100 { .mfb
1101 nop.m 0
1102 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1103 br.ret.sptk b0 // Common exit for 2^-7 < x < inf
1104 };;
1105
1106
1107 huges_logl:
1108 { .mmi
1109 getf.sig GR_signif = FR_XLog_Hi // Get significand of x+1
1110 mov GR_exp_2tom7 = 0x0fff8 // Exponent of 2^-7
1111 nop.i 0
1112 };;
1113
1114 { .mfi
1115 add GR_ad_tbl_1 = 0x040, GR_ad_z_1 // Point to Constants_G_H_h1
1116 nop.f 0
1117 add GR_ad_q = -0x60, GR_ad_z_1 // Point to Constants_P
1118 }
1119 { .mfi
1120 add GR_ad_z_2 = 0x140, GR_ad_z_1 // Point to Constants_Z_2
1121 nop.f 0
1122 add GR_ad_tbl_2 = 0x180, GR_ad_z_1 // Point to Constants_G_H_h2
1123 };;
1124
1125 { .mfi
1126 add GR_ad_tbl_3 = 0x280, GR_ad_z_1 // Point to Constants_G_H_h3
1127 nop.f 0
1128 extr.u GR_Index1 = GR_signif, 59, 4 // Get high 4 bits of signif
1129 };;
1130
1131 { .mfi
1132 shladd GR_ad_z_1 = GR_Index1, 2, GR_ad_z_1 // Point to Z_1
1133 nop.f 0
1134 extr.u GR_X_0 = GR_signif, 49, 15 // Get high 15 bits of signif.
1135 };;
1136
1137 { .mfi
1138 ld4 GR_Z_1 = [GR_ad_z_1] // Load Z_1
1139 nop.f 0
1140 mov GR_exp_mask = 0x1FFFF // Create exponent mask
1141 }
1142 { .mfi
1143 shladd GR_ad_tbl_1 = GR_Index1, 4, GR_ad_tbl_1 // Point to G_1
1144 nop.f 0
1145 mov GR_Bias = 0x0FFFF // Create exponent bias
1146 };;
1147
1148 { .mfi
1149 ldfps FR_G, FR_H = [GR_ad_tbl_1],8 // Load G_1, H_1
1150 fmerge.se FR_S_hi = f1,FR_XLog_Hi // Form |x|
1151 nop.i 0
1152 };;
1153
1154 { .mmi
1155 getf.exp GR_N = FR_XLog_Hi // Get N = exponent of x+1
1156 ldfd FR_h = [GR_ad_tbl_1] // Load h_1
1157 nop.i 0
1158 };;
1159
1160 { .mfi
1161 ldfe FR_log2_hi = [GR_ad_q],16 // Load log2_hi
1162 nop.f 0
1163 pmpyshr2.u GR_X_1 = GR_X_0,GR_Z_1,15 // Get bits 30-15 of X_0 * Z_1
1164 };;
1165
1166 { .mmi
1167 ldfe FR_log2_lo = [GR_ad_q],16 // Load log2_lo
1168 sub GR_N = GR_N, GR_Bias
1169 mov GR_exp_2tom80 = 0x0ffaf // Exponent of 2^-80
1170 };;
1171
1172 { .mfi
1173 ldfe FR_Q4 = [GR_ad_q],16 // Load Q4
1174 nop.f 0
1175 sub GR_minus_N = GR_Bias, GR_N // Form exponent of 2^(-N)
1176 };;
1177
1178 { .mmf
1179 ldfe FR_Q3 = [GR_ad_q],16 // Load Q3
1180 setf.sig FR_float_N = GR_N // Put integer N into rightmost sign
1181 nop.f 0
1182 };;
1183
1184 { .mmi
1185 ldfe FR_Q2 = [GR_ad_q],16 // Load Q2
1186 nop.m 0
1187 extr.u GR_Index2 = GR_X_1, 6, 4 // Extract bits 6-9 of X_1
1188 };;
1189
1190 { .mmi
1191 ldfe FR_Q1 = [GR_ad_q] // Load Q1
1192 shladd GR_ad_z_2 = GR_Index2, 2, GR_ad_z_2 // Point to Z_2
1193 nop.i 0
1194 };;
1195
1196 { .mmi
1197 ld4 GR_Z_2 = [GR_ad_z_2] // Load Z_2
1198 shladd GR_ad_tbl_2 = GR_Index2, 4, GR_ad_tbl_2 // Point to G_2
1199 nop.i 0
1200 };;
1201
1202 { .mmi
1203 ldfps FR_G2, FR_H2 = [GR_ad_tbl_2],8 // Load G_2, H_2
1204 nop.m 0
1205 nop.i 0
1206 };;
1207
1208 { .mmf
1209 ldfd FR_h2 = [GR_ad_tbl_2] // Load h_2
1210 setf.exp FR_2_to_minus_N = GR_minus_N // Form 2^(-N)
1211 nop.f 0
1212 };;
1213
1214 { .mfi
1215 nop.m 0
1216 nop.f 0
1217 pmpyshr2.u GR_X_2 = GR_X_1,GR_Z_2,15 // Get bits 30-15 of X_1*Z_2
1218 };;
1219
1220 // WE CANNOT USE GR_X_2 IN NEXT 3 CYCLES ("DEAD" ZONE!)
1221 // BECAUSE OF POSSIBLE 10 CLOCKS STALL!
1222 // (Just nops added - nothing to do here)
1223
1224 { .mfi
1225 nop.m 0
1226 nop.f 0
1227 nop.i 0
1228 };;
1229
1230 { .mfi
1231 nop.m 0
1232 nop.f 0
1233 nop.i 0
1234 };;
1235
1236 { .mfi
1237 nop.m 0
1238 nop.f 0
1239 nop.i 0
1240 };;
1241
1242 { .mfi
1243 nop.m 0
1244 nop.f 0
1245 extr.u GR_Index3 = GR_X_2, 1, 5 // Extract bits 1-5 of X_2
1246 };;
1247
1248 { .mfi
1249 shladd GR_ad_tbl_3 = GR_Index3, 4, GR_ad_tbl_3 // Point to G_3
1250 fcvt.xf FR_float_N = FR_float_N
1251 nop.i 0
1252 };;
1253
1254 { .mfi
1255 ldfps FR_G3, FR_H3 = [GR_ad_tbl_3],8 // Load G_3, H_3
1256 nop.f 0
1257 nop.i 0
1258 };;
1259
1260 { .mfi
1261 ldfd FR_h3 = [GR_ad_tbl_3] // Load h_3
1262 fmpy.s1 FR_G = FR_G, FR_G2 // G = G_1 * G_2
1263 nop.i 0
1264 }
1265 { .mfi
1266 nop.m 0
1267 fadd.s1 FR_H = FR_H, FR_H2 // H = H_1 + H_2
1268 nop.i 0
1269 };;
1270
1271 { .mmf
1272 nop.m 0
1273 nop.m 0
1274 fadd.s1 FR_h = FR_h, FR_h2 // h = h_1 + h_2
1275 };;
1276
1277 { .mfi
1278 nop.m 0
1279 fmpy.s1 FR_G = FR_G, FR_G3 // G = (G_1 * G_2)*G_3
1280 nop.i 0
1281 }
1282 { .mfi
1283 nop.m 0
1284 fadd.s1 FR_H = FR_H, FR_H3 // H = (H_1 + H_2)+H_3
1285 nop.i 0
1286 };;
1287
1288 { .mfi
1289 nop.m 0
1290 fadd.s1 FR_h = FR_h, FR_h3 // h = (h_1 + h_2) + h_3
1291 nop.i 0
1292 };;
1293
1294 { .mfi
1295 nop.m 0
1296 fms.s1 FR_r = FR_G, FR_S_hi, f1 // r = G * S_hi - 1
1297 nop.i 0
1298 }
1299 { .mfi
1300 nop.m 0
1301 fma.s1 FR_Y_hi = FR_float_N, FR_log2_hi, FR_H // Y_hi=N*log2_hi+H
1302 nop.i 0
1303 };;
1304
1305 { .mfi
1306 nop.m 0
1307 fma.s1 FR_h = FR_float_N, FR_log2_lo, FR_h // h = N*log2_lo+h
1308 nop.i 0
1309 };;
1310
1311 { .mfi
1312 nop.m 0
1313 fma.s1 FR_poly_lo = FR_r, FR_Q4, FR_Q3 // poly_lo = r * Q4 + Q3
1314 nop.i 0
1315 }
1316 { .mfi
1317 nop.m 0
1318 fmpy.s1 FR_rsq = FR_r, FR_r // rsq = r * r
1319 nop.i 0
1320 };;
1321
1322 { .mfi
1323 nop.m 0
1324 fma.s1 FR_poly_lo = FR_poly_lo, FR_r, FR_Q2 // poly_lo=poly_lo*r+Q2
1325 nop.i 0
1326 }
1327 { .mfi
1328 nop.m 0
1329 fma.s1 FR_rcub = FR_rsq, FR_r, f0 // rcub = r^3
1330 nop.i 0
1331 };;
1332
1333 { .mfi
1334 nop.m 0
1335 fma.s1 FR_poly_hi = FR_Q1, FR_rsq, FR_r // poly_hi = Q1*rsq + r
1336 nop.i 0
1337 };;
1338
1339 { .mfi
1340 nop.m 0
1341 fma.s1 FR_poly_lo = FR_poly_lo, FR_rcub, FR_h//poly_lo=poly_lo*r^3+h
1342 nop.i 0
1343 };;
1344 { .mfi
1345 nop.m 0
1346 fadd.s0 FR_Y_lo = FR_poly_hi, FR_poly_lo // Y_lo=poly_hi+poly_lo
1347 nop.i 0
1348 };;
1349 { .mfb
1350 nop.m 0
1351 fadd.s0 FR_Res = FR_Y_lo,FR_Y_hi // Result=Y_lo+Y_hi
1352 br.ret.sptk b0 // Common exit
1353 };;
1354
1355
1356 // NEAR ONE INTERVAL
1357 near_1:
1358 { .mfi
1359 nop.m 0
1360 frsqrta.s1 FR_Rcp, p0 = FR_2XM1 // Rcp = 1/x reciprocal appr. &SQRT&
1361 nop.i 0
1362 };;
1363
1364 { .mfi
1365 nop.m 0
1366 fma.s1 FR_PV6 = FR_PP5, FR_XM1, FR_PP4 // pv6 = P5*xm1+P4 $POLY$
1367 nop.i 0
1368 }
1369 { .mfi
1370 nop.m 0
1371 fma.s1 FR_QV6 = FR_QQ5, FR_XM1, FR_QQ4 // qv6 = Q5*xm1+Q4 $POLY$
1372 nop.i 0
1373 };;
1374
1375 { .mfi
1376 nop.m 0
1377 fma.s1 FR_PV4 = FR_PP3, FR_XM1, FR_PP2 // pv4 = P3*xm1+P2 $POLY$
1378 nop.i 0
1379 }
1380 { .mfi
1381 nop.m 0
1382 fma.s1 FR_QV4 = FR_QQ3, FR_XM1, FR_QQ2 // qv4 = Q3*xm1+Q2 $POLY$
1383 nop.i 0
1384 };;
1385
1386 { .mfi
1387 nop.m 0
1388 fma.s1 FR_XM12 = FR_XM1, FR_XM1, f0 // xm1^2 = xm1 * xm1 $POLY$
1389 nop.i 0
1390 };;
1391
1392 { .mfi
1393 nop.m 0
1394 fma.s1 FR_PV2 = FR_PP1, FR_XM1, FR_PP0 // pv2 = P1*xm1+P0 $POLY$
1395 nop.i 0
1396 }
1397 { .mfi
1398 nop.m 0
1399 fma.s1 FR_QV2 = FR_QQ1, FR_XM1, FR_QQ0 // qv2 = Q1*xm1+Q0 $POLY$
1400 nop.i 0
1401 };;
1402
1403 { .mfi
1404 nop.m 0
1405 fma.s1 FR_GG = FR_Rcp, FR_2XM1, f0 // g = Rcp * x &SQRT&
1406 nop.i 0
1407 }
1408 { .mfi
1409 nop.m 0
1410 fma.s1 FR_HH = FR_Half, FR_Rcp, f0 // h = 0.5 * Rcp &SQRT&
1411 nop.i 0
1412 };;
1413
1414
1415 { .mfi
1416 nop.m 0
1417 fma.s1 FR_PV3 = FR_XM12, FR_PV6, FR_PV4//pv3=pv6*xm1^2+pv4 $POLY$
1418 nop.i 0
1419 }
1420 { .mfi
1421 nop.m 0
1422 fma.s1 FR_QV3 = FR_XM12, FR_QV6, FR_QV4//qv3=qv6*xm1^2+qv4 $POLY$
1423 nop.i 0
1424 };;
1425
1426
1427 { .mfi
1428 nop.m 0
1429 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
1430 nop.i 0
1431 };;
1432
1433 { .mfi
1434 nop.m 0
1435 fma.s1 FR_PP = FR_XM12, FR_PV3, FR_PV2 //pp=pv3*xm1^2+pv2 $POLY$
1436 nop.i 0
1437 }
1438 { .mfi
1439 nop.m 0
1440 fma.s1 FR_QQ = FR_XM12, FR_QV3, FR_QV2 //qq=qv3*xm1^2+qv2 $POLY$
1441 nop.i 0
1442 };;
1443
1444 { .mfi
1445 nop.m 0
1446 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
1447 nop.i 0
1448 }
1449 { .mfi
1450 nop.m 0
1451 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1452 nop.i 0
1453 };;
1454
1455 { .mfi
1456 nop.m 0
1457 frcpa.s1 FR_Y0,p0 = f1,FR_QQ // y = frcpa(b) #DIV#
1458 nop.i 0
1459 }
1460 { .mfi
1461 nop.m 0
1462 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g*h &SQRT&
1463 nop.i 0
1464 };;
1465
1466 { .mfi
1467 nop.m 0
1468 fma.s1 FR_Q0 = FR_PP,FR_Y0,f0 // q = a*y #DIV#
1469 nop.i 0
1470 }
1471 { .mfi
1472 nop.m 0
1473 fnma.s1 FR_E0 = FR_Y0,FR_QQ,f1 // e = 1 - b*y #DIV#
1474 nop.i 0
1475 };;
1476
1477 { .mfi
1478 nop.m 0
1479 fma.s1 FR_GG = FR_GG, FR_EE, FR_GG // g = g * e + g &SQRT&
1480 nop.i 0
1481 }
1482 { .mfi
1483 nop.m 0
1484 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1485 nop.i 0
1486 };;
1487
1488 { .mfi
1489 nop.m 0
1490 fma.s1 FR_E2 = FR_E0,FR_E0,FR_E0 // e2 = e+e^2 #DIV#
1491 nop.i 0
1492 }
1493 { .mfi
1494 nop.m 0
1495 fma.s1 FR_E1 = FR_E0,FR_E0,f0 // e1 = e^2 #DIV#
1496 nop.i 0
1497 };;
1498
1499 { .mfi
1500 nop.m 0
1501 fnma.s1 FR_EE = FR_GG, FR_HH, FR_Half // e = 0.5 - g * h &SQRT&
1502 nop.i 0
1503 }
1504 { .mfi
1505 nop.m 0
1506 fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
1507 nop.i 0
1508 };;
1509
1510 { .mfi
1511 nop.m 0
1512 fma.s1 FR_Y1 = FR_Y0,FR_E2,FR_Y0 // y1 = y+y*e2 #DIV#
1513 nop.i 0
1514 }
1515 { .mfi
1516 nop.m 0
1517 fma.s1 FR_E3 = FR_E1,FR_E1,FR_E0 // e3 = e+e1^2 #DIV#
1518 nop.i 0
1519 };;
1520
1521 { .mfi
1522 nop.m 0
1523 fma.s1 FR_GG = FR_DD, FR_HH, FR_GG // g = d * h + g &SQRT&
1524 nop.i 0
1525 }
1526 { .mfi
1527 nop.m 0
1528 fma.s1 FR_HH = FR_HH, FR_EE, FR_HH // h = h * e + h &SQRT&
1529 nop.i 0
1530 };;
1531
1532 { .mfi
1533 nop.m 0
1534 fma.s1 FR_Y2 = FR_Y1,FR_E3,FR_Y0 // y2 = y+y1*e3 #DIV#
1535 nop.i 0
1536 }
1537 { .mfi
1538 nop.m 0
1539 fnma.s1 FR_R0 = FR_QQ,FR_Q0,FR_PP // r = a-b*q #DIV#
1540 nop.i 0
1541 };;
1542
1543 { .mfi
1544 nop.m 0
1545 fnma.s1 FR_DD = FR_GG, FR_GG, FR_2XM1 // d = x - g * g &SQRT&
1546 nop.i 0
1547 };;
1548
1549 { .mfi
1550 nop.m 0
1551 fnma.s1 FR_E4 = FR_QQ,FR_Y2,f1 // e4 = 1-b*y2 #DIV#
1552 nop.i 0
1553 }
1554 { .mfi
1555 nop.m 0
1556 fma.s1 FR_X_Hi = FR_R0,FR_Y2,FR_Q0 // x = q+r*y2 #DIV#
1557 nop.i 0
1558 };;
1559
1560 { .mfi
1561 nop.m 0
1562 fma.s1 FR_GL = FR_DD, FR_HH, f0 // gl = d * h &SQRT&
1563 nop.i 0
1564 };;
1565
1566 { .mfi
1567 nop.m 0
1568 fma.s1 FR_Y3 = FR_Y2,FR_E4,FR_Y2 // y3 = y2+y2*e4 #DIV#
1569 nop.i 0
1570 }
1571 { .mfi
1572 nop.m 0
1573 fnma.s1 FR_R1 = FR_QQ,FR_X_Hi,FR_PP // r1 = a-b*x #DIV#
1574 nop.i 0
1575 };;
1576
1577 { .mfi
1578 nop.m 0
1579 fma.s1 FR_HH = FR_GG, FR_X_Hi, f0 // hh = gg * x_hi
1580 nop.i 0
1581 }
1582 { .mfi
1583 nop.m 0
1584 fma.s1 FR_LH = FR_GL, FR_X_Hi, f0 // lh = gl * x_hi
1585 nop.i 0
1586 };;
1587
1588 { .mfi
1589 nop.m 0
1590 fma.s1 FR_X_lo = FR_R1,FR_Y3,f0 // x_lo = r1*y3 #DIV#
1591 nop.i 0
1592 };;
1593
1594 { .mfi
1595 nop.m 0
1596 fma.s1 FR_LL = FR_GL, FR_X_lo, f0 // ll = gl*x_lo
1597 nop.i 0
1598 }
1599 { .mfi
1600 nop.m 0
1601 fma.s1 FR_HL = FR_GG, FR_X_lo, f0 // hl = gg * x_lo
1602 nop.i 0
1603 };;
1604
1605 { .mfi
1606 nop.m 0
1607 fms.s1 FR_Res = FR_GL, f1, FR_LL // res = gl + ll
1608 nop.i 0
1609 };;
1610
1611 { .mfi
1612 nop.m 0
1613 fms.s1 FR_Res = FR_Res, f1, FR_LH // res = res + lh
1614 nop.i 0
1615 };;
1616
1617 { .mfi
1618 nop.m 0
1619 fms.s1 FR_Res = FR_Res, f1, FR_HL // res = res + hl
1620 nop.i 0
1621 };;
1622
1623 { .mfi
1624 nop.m 0
1625 fms.s1 FR_Res = FR_Res, f1, FR_HH // res = res + hh
1626 nop.i 0
1627 };;
1628
1629 { .mfb
1630 nop.m 0
1631 fma.s0 FR_Res = FR_Res, f1, FR_GG // result = res + gg
1632 br.ret.sptk b0 // Exit for near 1 path
1633 };;
1634 // NEAR ONE INTERVAL END
1635
1636
1637
1638
1639 acoshl_lt_pone:
1640 { .mfi
1641 nop.m 0
1642 fmerge.s FR_Arg_X = FR_Arg, FR_Arg
1643 nop.i 0
1644 };;
1645 { .mfb
1646 mov GR_Parameter_TAG = 135
1647 frcpa.s0 FR_Res,p0 = f0,f0 // get QNaN,and raise invalid
1648 br.cond.sptk __libm_error_region // exit if x < 1.0
1649 };;
1650
1651 GLOBAL_LIBM_END(acoshl)
1652 libm_alias_ldouble_other (acosh, acosh)
1653
1654
1655
1656 LOCAL_LIBM_ENTRY(__libm_error_region)
1657 .prologue
1658 { .mfi
1659 add GR_Parameter_Y = -32,sp // Parameter 2 value
1660 nop.f 0
1661 .save ar.pfs,GR_SAVE_PFS
1662 mov GR_SAVE_PFS = ar.pfs // Save ar.pfs
1663 }
1664 { .mfi
1665 .fframe 64
1666 add sp = -64,sp // Create new stack
1667 nop.f 0
1668 mov GR_SAVE_GP = gp // Save gp
1669 };;
1670
1671 { .mmi
1672 stfe [GR_Parameter_Y] = FR_Arg_Y,16 // Parameter 2 to stack
1673 add GR_Parameter_X = 16,sp // Parameter 1 address
1674 .save b0,GR_SAVE_B0
1675 mov GR_SAVE_B0 = b0 // Save b0
1676 };;
1677
1678 .body
1679 { .mib
1680 stfe [GR_Parameter_X] = FR_Arg_X // Parameter 1 to stack
1681 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1682 nop.b 0
1683 }
1684 { .mib
1685 stfe [GR_Parameter_Y] = FR_Res // Parameter 3 to stack
1686 add GR_Parameter_Y = -16,GR_Parameter_Y
1687 br.call.sptk b0 = __libm_error_support# // Error handling function
1688 };;
1689
1690 { .mmi
1691 nop.m 0
1692 nop.m 0
1693 add GR_Parameter_RESULT = 48,sp
1694 };;
1695
1696 { .mmi
1697 ldfe f8 = [GR_Parameter_RESULT] // Get return res
1698 .restore sp
1699 add sp = 64,sp // Restore stack pointer
1700 mov b0 = GR_SAVE_B0 // Restore return address
1701 };;
1702
1703 { .mib
1704 mov gp = GR_SAVE_GP // Restore gp
1705 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1706 br.ret.sptk b0 // Return
1707 };;
1708
1709 LOCAL_LIBM_END(__libm_error_region#)
1710
1711 .type __libm_error_support#,@function
1712 .global __libm_error_support#