initial commit
[glibc.git] / sysdeps / ia64 / fpu / w_tgamma_compat.S
1 .file "tgamma.s"
2
3
4 // Copyright (c) 2001 - 2005, Intel Corporation
5 // All rights reserved.
6 //
7 //
8 // Redistribution and use in source and binary forms, with or without
9 // modification, are permitted provided that the following conditions are
10 // met:
11 //
12 // * Redistributions of source code must retain the above copyright
13 // notice, this list of conditions and the following disclaimer.
14 //
15 // * Redistributions in binary form must reproduce the above copyright
16 // notice, this list of conditions and the following disclaimer in the
17 // documentation and/or other materials provided with the distribution.
18 //
19 // * The name of Intel Corporation may not be used to endorse or promote
20 // products derived from this software without specific prior written
21 // permission.
22
23 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT
25 // LIMITED TO,THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL OR ITS
27 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,
28 // EXEMPLARY,OR CONSEQUENTIAL DAMAGES (INCLUDING,BUT NOT LIMITED TO,
29 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,DATA,OR
30 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
31 // OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY OR TORT (INCLUDING
32 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33 // SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 //
35 // Intel Corporation is the author of this code,and requests that all
36 // problem reports or change requests be submitted to it directly at
37 // http://www.intel.com/software/products/opensource/libraries/num.htm.
38 //
39 //*********************************************************************
40 //
41 // History:
42 // 10/12/01 Initial version
43 // 05/20/02 Cleaned up namespace and sf0 syntax
44 // 02/10/03 Reordered header: .section, .global, .proc, .align
45 // 04/04/03 Changed error codes for overflow and negative integers
46 // 04/10/03 Changed code for overflow near zero handling
47 // 03/31/05 Reformatted delimiters between data tables
48 //
49 //*********************************************************************
50 //
51 //*********************************************************************
52 //
53 // Function: tgamma(x) computes the principle value of the GAMMA
54 // function of x.
55 //
56 //*********************************************************************
57 //
58 // Resources Used:
59 //
60 // Floating-Point Registers: f8-f15
61 // f33-f87
62 //
63 // General Purpose Registers:
64 // r8-r11
65 // r14-r28
66 // r32-r36
67 // r37-r40 (Used to pass arguments to error handling routine)
68 //
69 // Predicate Registers: p6-p15
70 //
71 //*********************************************************************
72 //
73 // IEEE Special Conditions:
74 //
75 // tgamma(+inf) = +inf
76 // tgamma(-inf) = QNaN
77 // tgamma(+/-0) = +/-inf
78 // tgamma(x<0, x - integer) = QNaN
79 // tgamma(SNaN) = QNaN
80 // tgamma(QNaN) = QNaN
81 //
82 //*********************************************************************
83 //
84 // Overview
85 //
86 // The method consists of three cases.
87 //
88 // If 2 <= x < OVERFLOW_BOUNDARY use case tgamma_regular;
89 // else if 0 < x < 2 use case tgamma_from_0_to_2;
90 // else if -(i+1) < x < -i, i = 0...184 use case tgamma_negatives;
91 //
92 // Case 2 <= x < OVERFLOW_BOUNDARY
93 // -------------------------------
94 // Here we use algorithm based on the recursive formula
95 // GAMMA(x+1) = x*GAMMA(x). For that we subdivide interval
96 // [2; OVERFLOW_BOUNDARY] into intervals [16*n; 16*(n+1)] and
97 // approximate GAMMA(x) by polynomial of 22th degree on each
98 // [16*n; 16*n+1], recursive formula is used to expand GAMMA(x)
99 // to [16*n; 16*n+1]. In other words we need to find n, i and r
100 // such that x = 16 * n + i + r where n and i are integer numbers
101 // and r is fractional part of x. So GAMMA(x) = GAMMA(16*n+i+r) =
102 // = (x-1)*(x-2)*...*(x-i)*GAMMA(x-i) =
103 // = (x-1)*(x-2)*...*(x-i)*GAMMA(16*n+r) ~
104 // ~ (x-1)*(x-2)*...*(x-i)*P22n(r).
105 //
106 // Step 1: Reduction
107 // -----------------
108 // N = [x] with truncate
109 // r = x - N, note 0 <= r < 1
110 //
111 // n = N & ~0xF - index of table that contains coefficient of
112 // polynomial approximation
113 // i = N & 0xF - is used in recursive formula
114 //
115 //
116 // Step 2: Approximation
117 // ---------------------
118 // We use factorized minimax approximation polynomials
119 // P22n(r) = A22*(r^2+C01(n)*R+C00(n))*
120 // *(r^2+C11(n)*R+C10(n))*...*(r^2+CA1(n)*R+CA0(n))
121 //
122 // Step 3: Recursion
123 // -----------------
124 // In case when i > 0 we need to multiply P22n(r) by product
125 // R(i)=(x-1)*(x-2)*...*(x-i). To reduce number of fp-instructions
126 // we can calculate R as follow:
127 // R(i) = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-1))*(x-i)) if i is
128 // even or R = ((x-1)*(x-2))*((x-3)*(x-4))*...*((x-(i-2))*(x-(i-1)))*
129 // *(i-1) if i is odd. In both cases we need to calculate
130 // R2(i) = (x^2-3*x+2)*(x^2-7*x+12)*...*(x^2+x+2*j*(2*j-1)) =
131 // = (x^2-3*x+2)*(x^2-7*x+12)*...*((x^2+x)+2*j*(2*(j-1)+(1-2*x))) =
132 // = (RA+2*(2-RB))*(RA+4*(4-RB))*...*(RA+2*j*(2*(j-1)+RB))
133 // where j = 1..[i/2], RA = x^2+x, RB = 1-2*x.
134 //
135 // Step 4: Reconstruction
136 // ----------------------
137 // Reconstruction is just simple multiplication i.e.
138 // GAMMA(x) = P22n(r)*R(i)
139 //
140 // Case 0 < x < 2
141 // --------------
142 // To calculate GAMMA(x) on this interval we do following
143 // if 1 <= x < 1.25 than GAMMA(x) = P15(x-1)
144 // if 1.25 <= x < 1.5 than GAMMA(x) = P15(x-x_min) where
145 // x_min is point of local minimum on [1; 2] interval.
146 // if 1.5 <= x < 2.0 than GAMMA(x) = P15(x-1.5)
147 // and
148 // if 0 < x < 1 than GAMMA(x) = GAMMA(x+1)/x
149 //
150 // Case -(i+1) < x < -i, i = 0...184
151 // ----------------------------------
152 // Here we use the fact that GAMMA(-x) = PI/(x*GAMMA(x)*sin(PI*x)) and
153 // so we need to calculate GAMMA(x), sin(PI*x)/PI. Calculation of
154 // GAMMA(x) is described above.
155 //
156 // Step 1: Reduction
157 // -----------------
158 // Note that period of sin(PI*x) is 2 and range reduction for
159 // sin(PI*x) is like to range reduction for GAMMA(x)
160 // i.e r = x - [x] with exception of cases
161 // when r > 0.5 (in such cases r = 1 - (x - [x])).
162 //
163 // Step 2: Approximation
164 // ---------------------
165 // To approximate sin(PI*x)/PI = sin(PI*(2*n+r))/PI =
166 // = (-1)^n*sin(PI*r)/PI Taylor series is used.
167 // sin(PI*r)/PI ~ S21(r).
168 //
169 // Step 3: Division
170 // ----------------
171 // To calculate 1/(x*GAMMA(x)*S21(r)) we use frcpa instruction
172 // with following Newton-Raphson interations.
173 //
174 //
175 //*********************************************************************
176
177 GR_Sig = r8
178 GR_TAG = r8
179 GR_ad_Data = r9
180 GR_SigRqLin = r10
181 GR_iSig = r11
182 GR_ExpOf1 = r11
183 GR_ExpOf8 = r11
184
185
186 GR_Sig2 = r14
187 GR_Addr_Mask1 = r15
188 GR_Sign_Exp = r16
189 GR_Tbl_Offs = r17
190 GR_Addr_Mask2 = r18
191 GR_ad_Co = r19
192 GR_Bit2 = r19
193 GR_ad_Ce = r20
194 GR_ad_Co7 = r21
195 GR_NzOvfBound = r21
196 GR_ad_Ce7 = r22
197 GR_Tbl_Ind = r23
198 GR_Tbl_16xInd = r24
199 GR_ExpOf025 = r24
200 GR_ExpOf05 = r25
201 GR_0x30033 = r26
202 GR_10 = r26
203 GR_12 = r27
204 GR_185 = r27
205 GR_14 = r28
206 GR_2 = r28
207 GR_fpsr = r28
208
209 GR_SAVE_B0 = r33
210 GR_SAVE_PFS = r34
211 GR_SAVE_GP = r35
212 GR_SAVE_SP = r36
213
214 GR_Parameter_X = r37
215 GR_Parameter_Y = r38
216 GR_Parameter_RESULT = r39
217 GR_Parameter_TAG = r40
218
219
220
221 FR_X = f10
222 FR_Y = f1 // tgamma is single argument function
223 FR_RESULT = f8
224
225 FR_AbsX = f9
226 FR_NormX = f9
227 FR_r02 = f11
228 FR_AbsXp1 = f12
229 FR_X2pX = f13
230 FR_1m2X = f14
231 FR_Rq1 = f14
232 FR_Xt = f15
233
234 FR_r = f33
235 FR_OvfBound = f34
236 FR_Xmin = f35
237 FR_2 = f36
238 FR_Rcp1 = f36
239 FR_Rcp3 = f36
240 FR_4 = f37
241 FR_5 = f38
242 FR_6 = f39
243 FR_8 = f40
244 FR_10 = f41
245 FR_12 = f42
246 FR_14 = f43
247 FR_GAMMA = f43
248 FR_05 = f44
249
250 FR_Rq2 = f45
251 FR_Rq3 = f46
252 FR_Rq4 = f47
253 FR_Rq5 = f48
254 FR_Rq6 = f49
255 FR_Rq7 = f50
256 FR_RqLin = f51
257
258 FR_InvAn = f52
259
260 FR_C01 = f53
261 FR_A15 = f53
262 FR_C11 = f54
263 FR_A14 = f54
264 FR_C21 = f55
265 FR_A13 = f55
266 FR_C31 = f56
267 FR_A12 = f56
268 FR_C41 = f57
269 FR_A11 = f57
270 FR_C51 = f58
271 FR_A10 = f58
272 FR_C61 = f59
273 FR_A9 = f59
274 FR_C71 = f60
275 FR_A8 = f60
276 FR_C81 = f61
277 FR_A7 = f61
278 FR_C91 = f62
279 FR_A6 = f62
280 FR_CA1 = f63
281 FR_A5 = f63
282 FR_C00 = f64
283 FR_A4 = f64
284 FR_rs2 = f64
285 FR_C10 = f65
286 FR_A3 = f65
287 FR_rs3 = f65
288 FR_C20 = f66
289 FR_A2 = f66
290 FR_rs4 = f66
291 FR_C30 = f67
292 FR_A1 = f67
293 FR_rs7 = f67
294 FR_C40 = f68
295 FR_A0 = f68
296 FR_rs8 = f68
297 FR_C50 = f69
298 FR_r2 = f69
299 FR_C60 = f70
300 FR_r3 = f70
301 FR_C70 = f71
302 FR_r4 = f71
303 FR_C80 = f72
304 FR_r7 = f72
305 FR_C90 = f73
306 FR_r8 = f73
307 FR_CA0 = f74
308 FR_An = f75
309
310 FR_S21 = f76
311 FR_S19 = f77
312 FR_Rcp0 = f77
313 FR_Rcp2 = f77
314 FR_S17 = f78
315 FR_S15 = f79
316 FR_S13 = f80
317 FR_S11 = f81
318 FR_S9 = f82
319 FR_S7 = f83
320 FR_S5 = f84
321 FR_S3 = f85
322
323 FR_iXt = f86
324 FR_rs = f87
325
326
327 // Data tables
328 //==============================================================
329 RODATA
330 .align 16
331
332 LOCAL_OBJECT_START(tgamma_data)
333 data8 0x406573FAE561F648 // overflow boundary (171.624376956302739927196)
334 data8 0x3FDD8B618D5AF8FE // point of local minium (0.461632144968362356785)
335 //
336 //[2; 3]
337 data8 0xEF0E85C9AE40ABE2,0x00004000 // C01
338 data8 0xCA2049DDB4096DD8,0x00004000 // C11
339 data8 0x99A203B4DC2D1A8C,0x00004000 // C21
340 data8 0xBF5D9D9C0C295570,0x00003FFF // C31
341 data8 0xE8DD037DEB833BAB,0x00003FFD // C41
342 data8 0xB6AE39A2A36AA03A,0x0000BFFE // C51
343 data8 0x804960DC2850277B,0x0000C000 // C61
344 data8 0xD9F3973841C09F80,0x0000C000 // C71
345 data8 0x9C198A676F8A2239,0x0000C001 // C81
346 data8 0xC98B7DAE02BE3226,0x0000C001 // C91
347 data8 0xE9CAF31AC69301BA,0x0000C001 // CA1
348 data8 0xFBBDD58608A0D172,0x00004000 // C00
349 data8 0xFDD0316D1E078301,0x00004000 // C10
350 data8 0x8630B760468C15E4,0x00004001 // C20
351 data8 0x93EDE20E47D9152E,0x00004001 // C30
352 data8 0xA86F3A38C77D6B19,0x00004001 // C40
353 //[16; 17]
354 data8 0xF87F757F365EE813,0x00004000 // C01
355 data8 0xECA84FBA92759DA4,0x00004000 // C11
356 data8 0xD4E0A55E07A8E913,0x00004000 // C21
357 data8 0xB0EB45E94C8A5F7B,0x00004000 // C31
358 data8 0x8050D6B4F7C8617D,0x00004000 // C41
359 data8 0x8471B111AA691E5A,0x00003FFF // C51
360 data8 0xADAF462AF96585C9,0x0000BFFC // C61
361 data8 0xD327C7A587A8C32B,0x0000BFFF // C71
362 data8 0xDEF5192B4CF5E0F1,0x0000C000 // C81
363 data8 0xBADD64BB205AEF02,0x0000C001 // C91
364 data8 0x9330A24AA67D6860,0x0000C002 // CA1
365 data8 0xF57EEAF36D8C47BE,0x00004000 // C00
366 data8 0x807092E12A251B38,0x00004001 // C10
367 data8 0x8C458F80DEE7ED1C,0x00004001 // C20
368 data8 0x9F30C731DC77F1A6,0x00004001 // C30
369 data8 0xBAC4E7E099C3A373,0x00004001 // C40
370 //[32; 33]
371 data8 0xC3059A415F142DEF,0x00004000 // C01
372 data8 0xB9C1DAC24664587A,0x00004000 // C11
373 data8 0xA7101D910992FFB2,0x00004000 // C21
374 data8 0x8A9522B8E4AA0AB4,0x00004000 // C31
375 data8 0xC76A271E4BA95DCC,0x00003FFF // C41
376 data8 0xC5D6DE2A38DB7FF2,0x00003FFE // C51
377 data8 0xDBA42086997818B2,0x0000BFFC // C61
378 data8 0xB8EDDB1424C1C996,0x0000BFFF // C71
379 data8 0xBF7372FB45524B5D,0x0000C000 // C81
380 data8 0xA03DDE759131580A,0x0000C001 // C91
381 data8 0xFDA6FC4022C1FFE3,0x0000C001 // CA1
382 data8 0x9759ABF797B2533D,0x00004000 // C00
383 data8 0x9FA160C6CF18CEC5,0x00004000 // C10
384 data8 0xB0EFF1E3530E0FCD,0x00004000 // C20
385 data8 0xCCD60D5C470165D1,0x00004000 // C30
386 data8 0xF5E53F6307B0B1C1,0x00004000 // C40
387 //[48; 49]
388 data8 0xAABE577FBCE37F5E,0x00004000 // C01
389 data8 0xA274CAEEB5DF7172,0x00004000 // C11
390 data8 0x91B90B6646C1B924,0x00004000 // C21
391 data8 0xF06718519CA256D9,0x00003FFF // C31
392 data8 0xAA9EE181C0E30263,0x00003FFF // C41
393 data8 0xA07BDB5325CB28D2,0x00003FFE // C51
394 data8 0x86C8B873204F9219,0x0000BFFD // C61
395 data8 0xB0192C5D3E4787D6,0x0000BFFF // C71
396 data8 0xB1E0A6263D4C19EF,0x0000C000 // C81
397 data8 0x93BA32A118EAC9AE,0x0000C001 // C91
398 data8 0xE942A39CD9BEE887,0x0000C001 // CA1
399 data8 0xE838B0957B0D3D0D,0x00003FFF // C00
400 data8 0xF60E0F00074FCF34,0x00003FFF // C10
401 data8 0x89869936AE00C2A5,0x00004000 // C20
402 data8 0xA0FE4E8AA611207F,0x00004000 // C30
403 data8 0xC3B1229CFF1DDAFE,0x00004000 // C40
404 //[64; 65]
405 data8 0x9C00DDF75CDC6183,0x00004000 // C01
406 data8 0x9446AE9C0F6A833E,0x00004000 // C11
407 data8 0x84ABC5083310B774,0x00004000 // C21
408 data8 0xD9BA3A0977B1ED83,0x00003FFF // C31
409 data8 0x989B18C99411D300,0x00003FFF // C41
410 data8 0x886E66402318CE6F,0x00003FFE // C51
411 data8 0x99028C2468F18F38,0x0000BFFD // C61
412 data8 0xAB72D17DCD40CCE1,0x0000BFFF // C71
413 data8 0xA9D9AC9BE42C2EF9,0x0000C000 // C81
414 data8 0x8C11D983AA177AD2,0x0000C001 // C91
415 data8 0xDC779E981C1F0F06,0x0000C001 // CA1
416 data8 0xC1FD4AC85965E8D6,0x00003FFF // C00
417 data8 0xCE3D2D909D389EC2,0x00003FFF // C10
418 data8 0xE7F79980AD06F5D8,0x00003FFF // C20
419 data8 0x88DD9F73C8680B5D,0x00004000 // C30
420 data8 0xA7D6CB2CB2D46F9D,0x00004000 // C40
421 //[80; 81]
422 data8 0x91C7FF4E993430D0,0x00004000 // C01
423 data8 0x8A6E7AB83E45A7E9,0x00004000 // C11
424 data8 0xF72D6382E427BEA9,0x00003FFF // C21
425 data8 0xC9E2E4F9B3B23ED6,0x00003FFF // C31
426 data8 0x8BEFEF56AE05D775,0x00003FFF // C41
427 data8 0xEE9666AB6A185560,0x00003FFD // C51
428 data8 0xA6AFAF5CEFAEE04D,0x0000BFFD // C61
429 data8 0xA877EAFEF1F9C880,0x0000BFFF // C71
430 data8 0xA45BD433048ECA15,0x0000C000 // C81
431 data8 0x86BD1636B774CC2E,0x0000C001 // C91
432 data8 0xD3721BE006E10823,0x0000C001 // CA1
433 data8 0xA97EFABA91854208,0x00003FFF // C00
434 data8 0xB4AF0AEBB3F97737,0x00003FFF // C10
435 data8 0xCC38241936851B0B,0x00003FFF // C20
436 data8 0xF282A6261006EA84,0x00003FFF // C30
437 data8 0x95B8E9DB1BD45BAF,0x00004000 // C40
438 //[96; 97]
439 data8 0x8A1FA3171B35A106,0x00004000 // C01
440 data8 0x830D5B8843890F21,0x00004000 // C11
441 data8 0xE98B0F1616677A23,0x00003FFF // C21
442 data8 0xBDF8347F5F67D4EC,0x00003FFF // C31
443 data8 0x825F15DE34EC055D,0x00003FFF // C41
444 data8 0xD4846186B8AAC7BE,0x00003FFD // C51
445 data8 0xB161093AB14919B1,0x0000BFFD // C61
446 data8 0xA65758EEA4800EF4,0x0000BFFF // C71
447 data8 0xA046B67536FA329C,0x0000C000 // C81
448 data8 0x82BBEC1BCB9E9068,0x0000C001 // C91
449 data8 0xCC9DE2B23BA91B0B,0x0000C001 // CA1
450 data8 0x983B16148AF77F94,0x00003FFF // C00
451 data8 0xA2A4D8EE90FEE5DD,0x00003FFF // C10
452 data8 0xB89446FA37FF481C,0x00003FFF // C20
453 data8 0xDC5572648485FB01,0x00003FFF // C30
454 data8 0x88CD5D7DB976129A,0x00004000 // C40
455 //[112; 113]
456 data8 0x8417098FD62AC5E3,0x00004000 // C01
457 data8 0xFA7896486B779CBB,0x00003FFF // C11
458 data8 0xDEC98B14AF5EEBD1,0x00003FFF // C21
459 data8 0xB48E153C6BF0B5A3,0x00003FFF // C31
460 data8 0xF597B038BC957582,0x00003FFE // C41
461 data8 0xBFC6F0884A415694,0x00003FFD // C51
462 data8 0xBA075A1392BDB5E5,0x0000BFFD // C61
463 data8 0xA4B79E01B44C7DB4,0x0000BFFF // C71
464 data8 0x9D12FA7711BFAB0F,0x0000C000 // C81
465 data8 0xFF24C47C8E108AB4,0x0000C000 // C91
466 data8 0xC7325EC86562606A,0x0000C001 // CA1
467 data8 0x8B47DCD9E1610938,0x00003FFF // C00
468 data8 0x9518B111B70F88B8,0x00003FFF // C10
469 data8 0xA9CC197206F68682,0x00003FFF // C20
470 data8 0xCB98294CC0D7A6A6,0x00003FFF // C30
471 data8 0xFE09493EA9165181,0x00003FFF // C40
472 //[128; 129]
473 data8 0xFE53D03442270D90,0x00003FFF // C01
474 data8 0xF0F857BAEC1993E4,0x00003FFF // C11
475 data8 0xD5FF6D70DBBC2FD3,0x00003FFF // C21
476 data8 0xACDAA5F4988B1074,0x00003FFF // C31
477 data8 0xE92E069F8AD75B54,0x00003FFE // C41
478 data8 0xAEBB64645BD94234,0x00003FFD // C51
479 data8 0xC13746249F39B43C,0x0000BFFD // C61
480 data8 0xA36B74F5B6297A1F,0x0000BFFF // C71
481 data8 0x9A77860DF180F6E5,0x0000C000 // C81
482 data8 0xF9F8457D84410A0C,0x0000C000 // C91
483 data8 0xC2BF44C649EB8597,0x0000C001 // CA1
484 data8 0x81225E7489BCDC0E,0x00003FFF // C00
485 data8 0x8A788A09CE0EED11,0x00003FFF // C10
486 data8 0x9E2E6F86D1B1D89C,0x00003FFF // C20
487 data8 0xBE6866B21CF6CCB5,0x00003FFF // C30
488 data8 0xEE94426EC1486AAE,0x00003FFF // C40
489 //[144; 145]
490 data8 0xF6113E09732A6497,0x00003FFF // C01
491 data8 0xE900D45931B04FC8,0x00003FFF // C11
492 data8 0xCE9FD58F745EBA5D,0x00003FFF // C21
493 data8 0xA663A9636C864C86,0x00003FFF // C31
494 data8 0xDEBF5315896CE629,0x00003FFE // C41
495 data8 0xA05FEA415EBD7737,0x00003FFD // C51
496 data8 0xC750F112BD9C4031,0x0000BFFD // C61
497 data8 0xA2593A35C51C6F6C,0x0000BFFF // C71
498 data8 0x9848E1DA7FB40C8C,0x0000C000 // C81
499 data8 0xF59FEE87A5759A4B,0x0000C000 // C91
500 data8 0xBF00203909E45A1D,0x0000C001 // CA1
501 data8 0xF1D8E157200127E5,0x00003FFE // C00
502 data8 0x81DD5397CB08D487,0x00003FFF // C10
503 data8 0x94C1DC271A8B766F,0x00003FFF // C20
504 data8 0xB3AFAF9B5D6EDDCF,0x00003FFF // C30
505 data8 0xE1FB4C57CA81BE1E,0x00003FFF // C40
506 //[160; 161]
507 data8 0xEEFFE5122AC72FFD,0x00003FFF // C01
508 data8 0xE22F70BB52AD54B3,0x00003FFF // C11
509 data8 0xC84FF021FE993EEA,0x00003FFF // C21
510 data8 0xA0DA2208EB5B2752,0x00003FFF // C31
511 data8 0xD5CDD2FCF8AD2DF5,0x00003FFE // C41
512 data8 0x940BEC6DCD811A59,0x00003FFD // C51
513 data8 0xCC954EF4FD4EBB81,0x0000BFFD // C61
514 data8 0xA1712E29A8C04554,0x0000BFFF // C71
515 data8 0x966B55DFB243521A,0x0000C000 // C81
516 data8 0xF1E6A2B9CEDD0C4C,0x0000C000 // C91
517 data8 0xBBC87BCC031012DB,0x0000C001 // CA1
518 data8 0xE43974E6D2818583,0x00003FFE // C00
519 data8 0xF5702A516B64C5B7,0x00003FFE // C10
520 data8 0x8CEBCB1B32E19471,0x00003FFF // C20
521 data8 0xAAC10F05BB77E0AF,0x00003FFF // C30
522 data8 0xD776EFCAB205CC58,0x00003FFF // C40
523 //[176; 177]
524 data8 0xE8DA614119811E5D,0x00003FFF // C01
525 data8 0xDC415E0288B223D8,0x00003FFF // C11
526 data8 0xC2D2243E44EC970E,0x00003FFF // C21
527 data8 0x9C086664B5307BEA,0x00003FFF // C31
528 data8 0xCE03D7A08B461156,0x00003FFE // C41
529 data8 0x894BE3BAAAB66ADC,0x00003FFD // C51
530 data8 0xD131EDD71A702D4D,0x0000BFFD // C61
531 data8 0xA0A907CDDBE10898,0x0000BFFF // C71
532 data8 0x94CC3CD9C765C808,0x0000C000 // C81
533 data8 0xEEA85F237815FC0D,0x0000C000 // C91
534 data8 0xB8FA04B023E43F91,0x0000C001 // CA1
535 data8 0xD8B2C7D9FCBD7EF9,0x00003FFE // C00
536 data8 0xE9566E93AAE7E38F,0x00003FFE // C10
537 data8 0x8646E78AABEF0255,0x00003FFF // C20
538 data8 0xA32AEDB62E304345,0x00003FFF // C30
539 data8 0xCE83E40280EE7DF0,0x00003FFF // C40
540 //
541 //[2; 3]
542 data8 0xC44FB47E90584083,0x00004001 // C50
543 data8 0xE863EE77E1C45981,0x00004001 // C60
544 data8 0x8AC15BE238B9D70E,0x00004002 // C70
545 data8 0xA5D94B6592350EF4,0x00004002 // C80
546 data8 0xC379DB3E20A148B3,0x00004002 // C90
547 data8 0xDACA49B73974F6C9,0x00004002 // CA0
548 data8 0x810E496A1AFEC895,0x00003FE1 // An
549 //[16; 17]
550 data8 0xE17C0357AAF3F817,0x00004001 // C50
551 data8 0x8BA8804750FBFBFE,0x00004002 // C60
552 data8 0xB18EAB3CB64BEBEE,0x00004002 // C70
553 data8 0xE90AB7015AF1C28F,0x00004002 // C80
554 data8 0xA0AB97CE9E259196,0x00004003 // C90
555 data8 0xF5E0E0A000C2D720,0x00004003 // CA0
556 data8 0xD97F0F87EC791954,0x00004005 // An
557 //[32; 33]
558 data8 0x980C293F3696040D,0x00004001 // C50
559 data8 0xC0DBFFBB948A9A4E,0x00004001 // C60
560 data8 0xFAB54625E9A588A2,0x00004001 // C70
561 data8 0xA7E08176D6050FBF,0x00004002 // C80
562 data8 0xEBAAEC4952270A9F,0x00004002 // C90
563 data8 0xB7479CDAD20550FE,0x00004003 // CA0
564 data8 0xAACD45931C3FF634,0x00004054 // An
565 //[48; 49]
566 data8 0xF5180F0000419AD5,0x00004000 // C50
567 data8 0x9D507D07BFBB2273,0x00004001 // C60
568 data8 0xCEB53F7A13A383E3,0x00004001 // C70
569 data8 0x8BAFEF9E0A49128F,0x00004002 // C80
570 data8 0xC58EF912D39E228C,0x00004002 // C90
571 data8 0x9A88118422BA208E,0x00004003 // CA0
572 data8 0xBD6C0E2477EC12CB,0x000040AC // An
573 //[64; 65]
574 data8 0xD410AC48BF7748DA,0x00004000 // C50
575 data8 0x89399B90AFEBD931,0x00004001 // C60
576 data8 0xB596DF8F77EB8560,0x00004001 // C70
577 data8 0xF6D9445A047FB4A6,0x00004001 // C80
578 data8 0xAF52F0DD65221357,0x00004002 // C90
579 data8 0x8989B45BFC881989,0x00004003 // CA0
580 data8 0xB7FCAE86E6E10D5A,0x0000410B // An
581 //[80; 81]
582 data8 0xBE759740E3B5AA84,0x00004000 // C50
583 data8 0xF8037B1B07D27609,0x00004000 // C60
584 data8 0xA4F6F6C7F0977D4F,0x00004001 // C70
585 data8 0xE131960233BF02C4,0x00004001 // C80
586 data8 0xA06DF43D3922BBE2,0x00004002 // C90
587 data8 0xFC266AB27255A360,0x00004002 // CA0
588 data8 0xD9F4B012EDAFEF2F,0x0000416F // An
589 //[96; 97]
590 data8 0xAEFC84CDA8E1EAA6,0x00004000 // C50
591 data8 0xE5009110DB5F3C8A,0x00004000 // C60
592 data8 0x98F5F48738E7B232,0x00004001 // C70
593 data8 0xD17EE64E21FFDC6B,0x00004001 // C80
594 data8 0x9596F7A7E36145CC,0x00004002 // C90
595 data8 0xEB64DBE50E125CAF,0x00004002 // CA0
596 data8 0xA090530D79E32D2E,0x000041D8 // An
597 //[112; 113]
598 data8 0xA33AEA22A16B2655,0x00004000 // C50
599 data8 0xD682B93BD7D7945C,0x00004000 // C60
600 data8 0x8FC854C6E6E30CC3,0x00004001 // C70
601 data8 0xC5754D828AFFDC7A,0x00004001 // C80
602 data8 0x8D41216B397139C2,0x00004002 // C90
603 data8 0xDE78D746848116E5,0x00004002 // CA0
604 data8 0xB8A297A2DC0630DB,0x00004244 // An
605 //[128; 129]
606 data8 0x99EB00F11D95E292,0x00004000 // C50
607 data8 0xCB005CB911EB779A,0x00004000 // C60
608 data8 0x8879AA2FDFF3A37A,0x00004001 // C70
609 data8 0xBBDA538AD40CAC2C,0x00004001 // C80
610 data8 0x8696D849D311B9DE,0x00004002 // C90
611 data8 0xD41E1C041481199F,0x00004002 // CA0
612 data8 0xEBA1A43D34EE61EE,0x000042B3 // An
613 //[144; 145]
614 data8 0x924F822578AA9F3D,0x00004000 // C50
615 data8 0xC193FAF9D3B36960,0x00004000 // C60
616 data8 0x827AE3A6B68ED0CA,0x00004001 // C70
617 data8 0xB3F52A27EED23F0B,0x00004001 // C80
618 data8 0x811A079FB3C94D79,0x00004002 // C90
619 data8 0xCB94415470B6F8D2,0x00004002 // CA0
620 data8 0x80A0260DCB3EC9AC,0x00004326 // An
621 //[160; 161]
622 data8 0x8BF24091E88B331D,0x00004000 // C50
623 data8 0xB9ADE01187E65201,0x00004000 // C60
624 data8 0xFAE4508F6E7625FE,0x00004000 // C70
625 data8 0xAD516668AD6D7367,0x00004001 // C80
626 data8 0xF8F5FF171154F637,0x00004001 // C90
627 data8 0xC461321268990C82,0x00004002 // CA0
628 data8 0xC3B693F344B0E6FE,0x0000439A // An
629 //
630 //[176; 177]
631 data8 0x868545EB42A258ED,0x00004000 // C50
632 data8 0xB2EF04ACE8BA0E6E,0x00004000 // C60
633 data8 0xF247D22C22E69230,0x00004000 // C70
634 data8 0xA7A1AB93E3981A90,0x00004001 // C80
635 data8 0xF10951733E2C697F,0x00004001 // C90
636 data8 0xBE3359BFAD128322,0x00004002 // CA0
637 data8 0x8000000000000000,0x00003fff
638 //
639 //[160; 161] for negatives
640 data8 0xA76DBD55B2E32D71,0x00003C63 // 1/An
641 //
642 // sin(pi*x)/pi
643 data8 0xBCBC4342112F52A2,0x00003FDE // S21
644 data8 0xFAFCECB86536F655,0x0000BFE3 // S19
645 data8 0x87E4C97F9CF09B92,0x00003FE9 // S17
646 data8 0xEA124C68E704C5CB,0x0000BFED // S15
647 data8 0x9BA38CFD59C8AA1D,0x00003FF2 // S13
648 data8 0x99C0B552303D5B21,0x0000BFF6 // S11
649 //
650 //[176; 177] for negatives
651 data8 0xBA5D5869211696FF,0x00003BEC // 1/An
652 //
653 // sin(pi*x)/pi
654 data8 0xD63402E79A853175,0x00003FF9 // S9
655 data8 0xC354723906DB36BA,0x0000BFFC // S7
656 data8 0xCFCE5A015E236291,0x00003FFE // S5
657 data8 0xD28D3312983E9918,0x0000BFFF // S3
658 //
659 //
660 // [1.0;1.25]
661 data8 0xA405530B067ECD3C,0x0000BFFC // A15
662 data8 0xF5B5413F95E1C282,0x00003FFD // A14
663 data8 0xC4DED71C782F76C8,0x0000BFFE // A13
664 data8 0xECF7DDDFD27C9223,0x00003FFE // A12
665 data8 0xFB73D31793068463,0x0000BFFE // A11
666 data8 0xFF173B7E66FD1D61,0x00003FFE // A10
667 data8 0xFFA5EF3959089E94,0x0000BFFE // A9
668 data8 0xFF8153BD42E71A4F,0x00003FFE // A8
669 data8 0xFEF9CAEE2CB5B533,0x0000BFFE // A7
670 data8 0xFE3F02E5EDB6811E,0x00003FFE // A6
671 data8 0xFB64074CED2658FB,0x0000BFFE // A5
672 data8 0xFB52882A095B18A4,0x00003FFE // A4
673 data8 0xE8508C7990A0DAC0,0x0000BFFE // A3
674 data8 0xFD32C611D8A881D0,0x00003FFE // A2
675 data8 0x93C467E37DB0C536,0x0000BFFE // A1
676 data8 0x8000000000000000,0x00003FFF // A0
677 //
678 // [1.25;1.5]
679 data8 0xD038092400619677,0x0000BFF7 // A15
680 data8 0xEA6DE925E6EB8C8F,0x00003FF3 // A14
681 data8 0xC53F83645D4597FC,0x0000BFF7 // A13
682 data8 0xE366DB2FB27B7ECD,0x00003FF7 // A12
683 data8 0xAC8FD5E11F6EEAD8,0x0000BFF8 // A11
684 data8 0xFB14010FB3697785,0x00003FF8 // A10
685 data8 0xB6F91CB5C371177B,0x0000BFF9 // A9
686 data8 0x85A262C6F8FEEF71,0x00003FFA // A8
687 data8 0xC038E6E3261568F9,0x0000BFFA // A7
688 data8 0x8F4BDE8883232364,0x00003FFB // A6
689 data8 0xBCFBBD5786537E9A,0x0000BFFB // A5
690 data8 0xA4C08BAF0A559479,0x00003FFC // A4
691 data8 0x85D74FA063E81476,0x0000BFFC // A3
692 data8 0xDB629FB9BBDC1C4E,0x00003FFD // A2
693 data8 0xF4F8FBC7C0C9D317,0x00003FC6 // A1
694 data8 0xE2B6E4153A57746C,0x00003FFE // A0
695 //
696 // [1.25;1.5]
697 data8 0x9533F9D3723B448C,0x0000BFF2 // A15
698 data8 0xF1F75D3C561CBBAF,0x00003FF5 // A14
699 data8 0xBA55A9A1FC883523,0x0000BFF8 // A13
700 data8 0xB5D5E9E5104FA995,0x00003FFA // A12
701 data8 0xFD84F35B70CD9AE2,0x0000BFFB // A11
702 data8 0x87445235F4688CC5,0x00003FFD // A10
703 data8 0xE7F236EBFB9F774E,0x0000BFFD // A9
704 data8 0xA6605F2721F787CE,0x00003FFE // A8
705 data8 0xCF579312AD7EAD72,0x0000BFFE // A7
706 data8 0xE96254A2407A5EAC,0x00003FFE // A6
707 data8 0xF41312A8572ED346,0x0000BFFE // A5
708 data8 0xF9535027C1B1F795,0x00003FFE // A4
709 data8 0xE7E82D0C613A8DE4,0x0000BFFE // A3
710 data8 0xFD23CD9741B460B8,0x00003FFE // A2
711 data8 0x93C30FD9781DBA88,0x0000BFFE // A1
712 data8 0xFFFFF1781FDBEE84,0x00003FFE // A0
713 LOCAL_OBJECT_END(tgamma_data)
714
715
716 //==============================================================
717 // Code
718 //==============================================================
719
720 .section .text
721 GLOBAL_LIBM_ENTRY(tgamma)
722 { .mfi
723 getf.exp GR_Sign_Exp = f8
724 fma.s1 FR_1m2X = f8,f1,f8 // 2x
725 addl GR_ad_Data = @ltoff(tgamma_data), gp
726 }
727 { .mfi
728 mov GR_ExpOf8 = 0x10002 // 8
729 fcvt.fx.trunc.s1 FR_iXt = f8 // [x]
730 mov GR_ExpOf05 = 0xFFFE // 0.5
731 };;
732 { .mfi
733 getf.sig GR_Sig = f8
734 fma.s1 FR_2 = f1,f1,f1 // 2
735 mov GR_Addr_Mask1 = 0x780
736 }
737 { .mlx
738 setf.exp FR_8 = GR_ExpOf8
739 movl GR_10 = 0x4024000000000000
740 };;
741 { .mfi
742 ld8 GR_ad_Data = [GR_ad_Data]
743 fcmp.lt.s1 p14,p15 = f8,f0
744 tbit.z p12,p13 = GR_Sign_Exp,0x10 // p13 if x >= 2
745 }
746 { .mlx
747 and GR_Bit2 = 4,GR_Sign_Exp
748 movl GR_12 = 0x4028000000000000
749 };;
750 { .mfi
751 setf.d FR_10 = GR_10
752 fma.s1 FR_r02 = f8,f1,f0
753 extr.u GR_Tbl_Offs = GR_Sig,58,6
754 }
755 { .mfi
756 (p12) mov GR_Addr_Mask1 = r0
757 fma.s1 FR_NormX = f8,f1,f0
758 cmp.ne p8,p0 = GR_Bit2,r0
759 };;
760 { .mfi
761 (p8) shladd GR_Tbl_Offs = GR_Tbl_Offs,4,r0
762 fclass.m p10,p0 = f8,0x1E7 // Test x for NaTVal, NaN, +/-0, +/-INF
763 tbit.nz p11,p0 = GR_Sign_Exp,1
764 }
765 { .mlx
766 add GR_Addr_Mask2 = GR_Addr_Mask1,GR_Addr_Mask1
767 movl GR_14 = 0x402C000000000000
768 };;
769 .pred.rel "mutex",p14,p15
770 { .mfi
771 setf.d FR_12 = GR_12
772 (p14) fma.s1 FR_1m2X = f1,f1,FR_1m2X // RB=1-2|x|
773 tbit.nz p8,p9 = GR_Sign_Exp,0
774 }
775 { .mfi
776 ldfpd FR_OvfBound,FR_Xmin = [GR_ad_Data],16
777 (p15) fms.s1 FR_1m2X = f1,f1,FR_1m2X // RB=1-2|x|
778 (p11) shladd GR_Tbl_Offs = GR_Tbl_Offs,2,r0
779 };;
780 .pred.rel "mutex",p9,p8
781 { .mfi
782 setf.d FR_14 = GR_14
783 fma.s1 FR_4 = FR_2,FR_2,f0
784 (p8) and GR_Tbl_Offs = GR_Tbl_Offs, GR_Addr_Mask1
785 }
786 { .mfi
787 setf.exp FR_05 = GR_ExpOf05
788 fma.s1 FR_6 = FR_2,FR_2,FR_2
789 (p9) and GR_Tbl_Offs = GR_Tbl_Offs, GR_Addr_Mask2
790 };;
791 .pred.rel "mutex",p9,p8
792 { .mfi
793 (p8) shladd GR_ad_Co = GR_Tbl_Offs,1,GR_ad_Data
794 fcvt.xf FR_Xt = FR_iXt // [x]
795 (p15) tbit.z.unc p11,p0 = GR_Sign_Exp,0x10 // p11 if 0 < x < 2
796 }
797 { .mfi
798 (p9) add GR_ad_Co = GR_ad_Data,GR_Tbl_Offs
799 fma.s1 FR_5 = FR_2,FR_2,f1
800 (p15) cmp.lt.unc p7,p6 = GR_ExpOf05,GR_Sign_Exp // p7 if 0 < x < 1
801 };;
802 { .mfi
803 add GR_ad_Ce = 16,GR_ad_Co
804 (p11) frcpa.s1 FR_Rcp0,p0 = f1,f8
805 sub GR_Tbl_Offs = GR_ad_Co,GR_ad_Data
806 }
807 { .mfb
808 ldfe FR_C01 = [GR_ad_Co],32
809 (p7) fms.s1 FR_r02 = FR_r02,f1,f1
810 // jump if x is NaTVal, NaN, +/-0, +/-INF
811 (p10) br.cond.spnt tgamma_spec
812 };;
813 .pred.rel "mutex",p14,p15
814 { .mfi
815 ldfe FR_C11 = [GR_ad_Ce],32
816 (p14) fms.s1 FR_X2pX = f8,f8,f8 // RA=x^2+|x|
817 shr GR_Tbl_Ind = GR_Tbl_Offs,8
818 }
819 { .mfb
820 ldfe FR_C21 = [GR_ad_Co],32
821 (p15) fma.s1 FR_X2pX = f8,f8,f8 // RA=x^2+x
822 // jump if 0 < x < 2
823 (p11) br.cond.spnt tgamma_from_0_to_2
824 };;
825 { .mfi
826 ldfe FR_C31 = [GR_ad_Ce],32
827 fma.s1 FR_Rq2 = FR_2,f1,FR_1m2X // 2 + B
828 cmp.ltu p7,p0=0xB,GR_Tbl_Ind
829 }
830 { .mfb
831 ldfe FR_C41 = [GR_ad_Co],32
832 fma.s1 FR_Rq3 = FR_2,FR_2,FR_1m2X // 4 + B
833 // jump if GR_Tbl_Ind > 11, i.e |x| is more than 192
834 (p7) br.cond.spnt tgamma_spec_res
835 };;
836 { .mfi
837 ldfe FR_C51 = [GR_ad_Ce],32
838 fma.s1 FR_Rq4 = FR_6,f1,FR_1m2X // 6 + B
839 shr GR_Tbl_Offs = GR_Tbl_Offs,1
840 }
841 { .mfi
842 ldfe FR_C61 = [GR_ad_Co],32
843 fma.s1 FR_Rq5 = FR_4,FR_2,FR_1m2X // 8 + B
844 nop.i 0
845 };;
846 { .mfi
847 ldfe FR_C71 = [GR_ad_Ce],32
848 (p14) fms.s1 FR_r = FR_Xt,f1,f8 // r = |x| - [|x|]
849 shr GR_Tbl_16xInd = GR_Tbl_Offs,3
850 }
851 { .mfi
852 ldfe FR_C81 = [GR_ad_Co],32
853 (p15) fms.s1 FR_r = f8,f1,FR_Xt // r = x - [x]
854 add GR_ad_Data = 0xC00,GR_ad_Data
855 };;
856 { .mfi
857 ldfe FR_C91 = [GR_ad_Ce],32
858 fma.s1 FR_Rq6 = FR_5,FR_2,FR_1m2X // 10 + B
859 (p14) mov GR_0x30033 = 0x30033
860 }
861 { .mfi
862 ldfe FR_CA1 = [GR_ad_Co],32
863 fma.s1 FR_Rq7 = FR_6,FR_2,FR_1m2X // 12 + B
864 sub GR_Tbl_Offs = GR_Tbl_Offs,GR_Tbl_16xInd
865 };;
866 { .mfi
867 ldfe FR_C00 = [GR_ad_Ce],32
868 fma.s1 FR_Rq1 = FR_Rq1,FR_2,FR_X2pX // (x-1)*(x-2)
869 (p13) cmp.eq.unc p8,p0 = r0,GR_Tbl_16xInd // index is 0 i.e. arg from [2;16)
870 }
871 { .mfi
872 ldfe FR_C10 = [GR_ad_Co],32
873 (p14) fms.s1 FR_AbsX = f0,f0,FR_NormX // absolute value of argument
874 add GR_ad_Co7 = GR_ad_Data,GR_Tbl_Offs
875 };;
876 { .mfi
877 ldfe FR_C20 = [GR_ad_Ce],32
878 fma.s1 FR_Rq2 = FR_Rq2,FR_4,FR_X2pX // (x-3)*(x-4)
879 add GR_ad_Ce7 = 16,GR_ad_Co7
880 }
881 { .mfi
882 ldfe FR_C30 = [GR_ad_Co],32
883 fma.s1 FR_Rq3 = FR_Rq3,FR_6,FR_X2pX // (x-5)*(x-6)
884 nop.i 0
885 };;
886 { .mfi
887 ldfe FR_C40 = [GR_ad_Ce],32
888 fma.s1 FR_Rq4 = FR_Rq4,FR_8,FR_X2pX // (x-7)*(x-8)
889 (p14) cmp.leu.unc p7,p0 = GR_0x30033,GR_Sign_Exp
890 }
891 { .mfb
892 ldfe FR_C50 = [GR_ad_Co7],32
893 fma.s1 FR_Rq5 = FR_Rq5,FR_10,FR_X2pX // (x-9)*(x-10)
894 // jump if x is less or equal to -2^52, i.e. x is big negative integer
895 (p7) br.cond.spnt tgamma_singularity
896 };;
897 { .mfi
898 ldfe FR_C60 = [GR_ad_Ce7],32
899 fma.s1 FR_C01 = FR_C01,f1,FR_r
900 add GR_ad_Ce = 0x560,GR_ad_Data
901 }
902 { .mfi
903 ldfe FR_C70 = [GR_ad_Co7],32
904 fma.s1 FR_rs = f0,f0,FR_r // reduced arg for sin(pi*x)
905 add GR_ad_Co = 0x550,GR_ad_Data
906 };;
907 { .mfi
908 ldfe FR_C80 = [GR_ad_Ce7],32
909 fma.s1 FR_C11 = FR_C11,f1,FR_r
910 nop.i 0
911 }
912 { .mfi
913 ldfe FR_C90 = [GR_ad_Co7],32
914 fma.s1 FR_C21 = FR_C21,f1,FR_r
915 nop.i 0
916 };;
917 .pred.rel "mutex",p12,p13
918 { .mfi
919 (p13) getf.sig GR_iSig = FR_iXt
920 fcmp.lt.s1 p11,p0 = FR_05,FR_r
921 mov GR_185 = 185
922 }
923 { .mfi
924 nop.m 0
925 fma.s1 FR_Rq6 = FR_Rq6,FR_12,FR_X2pX // (x-11)*(x-12)
926 nop.i 0
927 };;
928 { .mfi
929 ldfe FR_CA0 = [GR_ad_Ce7],32
930 fma.s1 FR_C31 = FR_C31,f1,FR_r
931 (p12) mov GR_iSig = 0
932 }
933 { .mfi
934 ldfe FR_An = [GR_ad_Co7],0x80
935 fma.s1 FR_C41 = FR_C41,f1,FR_r
936 nop.i 0
937 };;
938 { .mfi
939 (p14) getf.sig GR_Sig = FR_r
940 fma.s1 FR_C51 = FR_C51,f1,FR_r
941 (p14) sub GR_iSig = r0,GR_iSig
942 }
943 { .mfi
944 ldfe FR_S21 = [GR_ad_Co],32
945 fma.s1 FR_C61 = FR_C61,f1,FR_r
946 nop.i 0
947 };;
948 { .mfi
949 ldfe FR_S19 = [GR_ad_Ce],32
950 fma.s1 FR_C71 = FR_C71,f1,FR_r
951 and GR_SigRqLin = 0xF,GR_iSig
952 }
953 { .mfi
954 ldfe FR_S17 = [GR_ad_Co],32
955 fma.s1 FR_C81 = FR_C81,f1,FR_r
956 mov GR_2 = 2
957 };;
958 { .mfi
959 (p14) ldfe FR_InvAn = [GR_ad_Co7]
960 fma.s1 FR_C91 = FR_C91,f1,FR_r
961 // if significand of r is 0 tnan argument is negative integer
962 (p14) cmp.eq.unc p12,p0 = r0,GR_Sig
963 }
964 { .mfb
965 (p8) sub GR_SigRqLin = GR_SigRqLin,GR_2 // subtract 2 if 2 <= x < 16
966 fma.s1 FR_CA1 = FR_CA1,f1,FR_r
967 // jump if x is negative integer such that -2^52 < x < -185
968 (p12) br.cond.spnt tgamma_singularity
969 };;
970 { .mfi
971 setf.sig FR_Xt = GR_SigRqLin
972 (p11) fms.s1 FR_rs = f1,f1,FR_r
973 (p14) cmp.ltu.unc p7,p0 = GR_185,GR_iSig
974 }
975 { .mfb
976 ldfe FR_S15 = [GR_ad_Ce],32
977 fma.s1 FR_Rq7 = FR_Rq7,FR_14,FR_X2pX // (x-13)*(x-14)
978 // jump if x is noninteger such that -2^52 < x < -185
979 (p7) br.cond.spnt tgamma_underflow
980 };;
981 { .mfi
982 ldfe FR_S13 = [GR_ad_Co],48
983 fma.s1 FR_C01 = FR_C01,FR_r,FR_C00
984 and GR_Sig2 = 0xE,GR_SigRqLin
985 }
986 { .mfi
987 ldfe FR_S11 = [GR_ad_Ce],48
988 fma.s1 FR_C11 = FR_C11,FR_r,FR_C10
989 nop.i 0
990 };;
991 { .mfi
992 ldfe FR_S9 = [GR_ad_Co],32
993 fma.s1 FR_C21 = FR_C21,FR_r,FR_C20
994 // should we mul by polynomial of recursion?
995 cmp.eq p13,p12 = r0,GR_SigRqLin
996 }
997 { .mfi
998 ldfe FR_S7 = [GR_ad_Ce],32
999 fma.s1 FR_C31 = FR_C31,FR_r,FR_C30
1000 nop.i 0
1001 };;
1002 { .mfi
1003 ldfe FR_S5 = [GR_ad_Co],32
1004 fma.s1 FR_C41 = FR_C41,FR_r,FR_C40
1005 nop.i 0
1006 }
1007 { .mfi
1008 ldfe FR_S3 = [GR_ad_Ce],32
1009 fma.s1 FR_C51 = FR_C51,FR_r,FR_C50
1010 nop.i 0
1011 };;
1012 { .mfi
1013 nop.m 0
1014 fma.s1 FR_C61 = FR_C61,FR_r,FR_C60
1015 nop.i 0
1016 }
1017 { .mfi
1018 nop.m 0
1019 fma.s1 FR_C71 = FR_C71,FR_r,FR_C70
1020 nop.i 0
1021 };;
1022 { .mfi
1023 nop.m 0
1024 fma.s1 FR_C81 = FR_C81,FR_r,FR_C80
1025 nop.i 0
1026 }
1027 { .mfi
1028 nop.m 0
1029 fma.s1 FR_C91 = FR_C91,FR_r,FR_C90
1030 nop.i 0
1031 };;
1032 { .mfi
1033 nop.m 0
1034 fma.s1 FR_CA1 = FR_CA1,FR_r,FR_CA0
1035 nop.i 0
1036 }
1037 { .mfi
1038 nop.m 0
1039 fma.s1 FR_C01 = FR_C01,FR_C11,f0
1040 nop.i 0
1041 };;
1042 { .mfi
1043 nop.m 0
1044 fma.s1 FR_C21 = FR_C21,FR_C31,f0
1045 nop.i 0
1046 }
1047 { .mfi
1048 nop.m 0
1049 fma.s1 FR_rs2 = FR_rs,FR_rs,f0
1050 (p12) cmp.lt.unc p7,p0 = 2,GR_Sig2 // should mul by FR_Rq2?
1051 };;
1052 { .mfi
1053 nop.m 0
1054 fma.s1 FR_C41 = FR_C41,FR_C51,f0
1055 nop.i 0
1056 }
1057 { .mfi
1058 nop.m 0
1059 (p7) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq2,f0
1060 (p12) cmp.lt.unc p9,p0 = 6,GR_Sig2 // should mul by FR_Rq4?
1061 };;
1062 { .mfi
1063 nop.m 0
1064 fma.s1 FR_C61 = FR_C61,FR_C71,f0
1065 (p15) cmp.eq p11,p0 = r0,r0
1066 }
1067 { .mfi
1068 nop.m 0
1069 (p9) fma.s1 FR_Rq3 = FR_Rq3,FR_Rq4,f0
1070 (p12) cmp.lt.unc p8,p0 = 10,GR_Sig2 // should mul by FR_Rq6?
1071 };;
1072 { .mfi
1073 nop.m 0
1074 fma.s1 FR_C81 = FR_C81,FR_C91,f0
1075 nop.i 0
1076 }
1077 { .mfi
1078 nop.m 0
1079 (p8) fma.s1 FR_Rq5 = FR_Rq5,FR_Rq6,f0
1080 (p14) cmp.ltu p0,p11 = 0x9,GR_Tbl_Ind
1081 };;
1082 { .mfi
1083 nop.m 0
1084 fcvt.xf FR_RqLin = FR_Xt
1085 nop.i 0
1086 }
1087 { .mfi
1088 nop.m 0
1089 (p11) fma.s1 FR_CA1 = FR_CA1,FR_An,f0
1090 nop.i 0
1091 };;
1092 { .mfi
1093 nop.m 0
1094 fma.s1 FR_S21 = FR_S21,FR_rs2,FR_S19
1095 nop.i 0
1096 }
1097 { .mfi
1098 nop.m 0
1099 fma.s1 FR_S17 = FR_S17,FR_rs2,FR_S15
1100 nop.i 0
1101 };;
1102 { .mfi
1103 nop.m 0
1104 fma.s1 FR_C01 = FR_C01,FR_C21,f0
1105 nop.i 0
1106 }
1107 { .mfi
1108 nop.m 0
1109 fma.s1 FR_rs4 = FR_rs2,FR_rs2,f0
1110 (p12) cmp.lt.unc p8,p0 = 4,GR_Sig2 // should mul by FR_Rq3?
1111 };;
1112 { .mfi
1113 nop.m 0
1114 (p8) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq3,f0
1115 nop.i 0
1116 }
1117 { .mfi
1118 nop.m 0
1119 fma.s1 FR_S13 = FR_S13,FR_rs2,FR_S11
1120 (p12) cmp.lt.unc p9,p0 = 12,GR_Sig2 // should mul by FR_Rq7?
1121 };;
1122 { .mfi
1123 nop.m 0
1124 fma.s1 FR_C41 = FR_C41,FR_C61,f0
1125 nop.i 0
1126 }
1127 { .mfi
1128 nop.m 0
1129 (p9) fma.s1 FR_Rq5 = FR_Rq5,FR_Rq7,f0
1130 nop.i 0
1131 };;
1132 { .mfi
1133 nop.m 0
1134 fma.s1 FR_C81 = FR_C81,FR_CA1,f0
1135 nop.i 0
1136 }
1137 { .mfi
1138 nop.m 0
1139 fma.s1 FR_S9 = FR_S9,FR_rs2,FR_S7
1140 nop.i 0
1141 };;
1142 { .mfi
1143 nop.m 0
1144 fma.s1 FR_S5 = FR_S5,FR_rs2,FR_S3
1145 nop.i 0
1146 };;
1147 { .mfi
1148 nop.m 0
1149 fma.s1 FR_rs3 = FR_rs2,FR_rs,f0
1150 (p12) tbit.nz.unc p6,p0 = GR_SigRqLin,0
1151 }
1152 { .mfi
1153 nop.m 0
1154 fma.s1 FR_rs8 = FR_rs4,FR_rs4,f0
1155 nop.i 0
1156 };;
1157 { .mfi
1158 nop.m 0
1159 fma.s1 FR_S21 = FR_S21,FR_rs4,FR_S17
1160 mov GR_ExpOf1 = 0x2FFFF
1161 }
1162 { .mfi
1163 nop.m 0
1164 (p6) fms.s1 FR_RqLin = FR_AbsX,f1,FR_RqLin
1165 (p12) cmp.lt.unc p8,p0 = 8,GR_Sig2 // should mul by FR_Rq5?
1166 };;
1167 { .mfi
1168 nop.m 0
1169 fma.s1 FR_C01 = FR_C01,FR_C41,f0
1170 nop.i 0
1171 }
1172 { .mfi
1173 nop.m 0
1174 (p8) fma.s1 FR_Rq1 = FR_Rq1,FR_Rq5,f0
1175 (p14) cmp.gtu.unc p7,p0 = GR_Sign_Exp,GR_ExpOf1
1176 };;
1177 { .mfi
1178 nop.m 0
1179 fma.s1 FR_S13 = FR_S13,FR_rs4,FR_S9
1180 nop.i 0
1181 }
1182 { .mfi
1183 nop.m 0
1184 (p7) fma.s1 FR_C81 = FR_C81,FR_AbsX,f0
1185 nop.i 0
1186 };;
1187 { .mfi
1188 nop.m 0
1189 (p14) fma.s1 FR_AbsXp1 = f1,f1,FR_AbsX // |x|+1
1190 nop.i 0
1191 }
1192 { .mfi
1193 nop.m 0
1194 (p15) fcmp.lt.unc.s1 p0,p10 = FR_AbsX,FR_OvfBound // x >= overflow_boundary
1195 nop.i 0
1196 };;
1197 { .mfi
1198 nop.m 0
1199 fma.s1 FR_rs7 = FR_rs4,FR_rs3,f0
1200 nop.i 0
1201 }
1202 { .mfi
1203 nop.m 0
1204 fma.s1 FR_S5 = FR_S5,FR_rs3,FR_rs
1205 nop.i 0
1206 };;
1207 { .mib
1208 (p14) cmp.lt p13,p0 = r0,r0 // set p13 to 0 if x < 0
1209 (p12) cmp.eq.unc p8,p9 = 1,GR_SigRqLin
1210 (p10) br.cond.spnt tgamma_spec_res
1211 };;
1212 { .mfi
1213 getf.sig GR_Sig = FR_iXt
1214 (p6) fma.s1 FR_Rq1 = FR_Rq1,FR_RqLin,f0
1215 // should we mul by polynomial of recursion?
1216 (p15) cmp.eq.unc p0,p11 = r0,GR_SigRqLin
1217 }
1218 { .mfb
1219 nop.m 0
1220 fma.s1 FR_GAMMA = FR_C01,FR_C81,f0
1221 (p11) br.cond.spnt tgamma_positives
1222 };;
1223 { .mfi
1224 nop.m 0
1225 fma.s1 FR_S21 = FR_S21,FR_rs8,FR_S13
1226 nop.i 0
1227 }
1228 { .mfb
1229 nop.m 0
1230 (p13) fma.d.s0 f8 = FR_C01,FR_C81,f0
1231 (p13) br.ret.spnt b0
1232 };;
1233 .pred.rel "mutex",p8,p9
1234 { .mfi
1235 nop.m 0
1236 (p9) fma.s1 FR_GAMMA = FR_GAMMA,FR_Rq1,f0
1237 tbit.z p6,p7 = GR_Sig,0 // p6 if sin<0, p7 if sin>0
1238 }
1239 { .mfi
1240 nop.m 0
1241 (p8) fma.s1 FR_GAMMA = FR_GAMMA,FR_RqLin,f0
1242 nop.i 0
1243 };;
1244 { .mfi
1245 nop.m 0
1246 fma.s1 FR_S21 = FR_S21,FR_rs7,FR_S5
1247 nop.i 0
1248 };;
1249 .pred.rel "mutex",p6,p7
1250 { .mfi
1251 nop.m 0
1252 (p6) fnma.s1 FR_GAMMA = FR_GAMMA,FR_S21,f0
1253 nop.i 0
1254 }
1255 { .mfi
1256 nop.m 0
1257 (p7) fma.s1 FR_GAMMA = FR_GAMMA,FR_S21,f0
1258 mov GR_Sig2 = 1
1259 };;
1260 { .mfi
1261 nop.m 0
1262 frcpa.s1 FR_Rcp0,p0 = f1,FR_GAMMA
1263 cmp.ltu p13,p0 = GR_Sign_Exp,GR_ExpOf1
1264 };;
1265 // NR method: ineration #1
1266 { .mfi
1267 (p13) getf.exp GR_Sign_Exp = FR_AbsX
1268 fnma.s1 FR_Rcp1 = FR_Rcp0,FR_GAMMA,f1 // t = 1 - r0*x
1269 (p13) shl GR_Sig2 = GR_Sig2,63
1270 };;
1271 { .mfi
1272 (p13) getf.sig GR_Sig = FR_AbsX
1273 nop.f 0
1274 (p13) mov GR_NzOvfBound = 0xFBFF
1275 };;
1276 { .mfi
1277 (p13) cmp.ltu.unc p8,p0 = GR_Sign_Exp,GR_NzOvfBound // p8 <- overflow
1278 nop.f 0
1279 (p13) cmp.eq.unc p9,p0 = GR_Sign_Exp,GR_NzOvfBound
1280 };;
1281 { .mfb
1282 nop.m 0
1283 (p13) fma.d.s0 FR_X = f1,f1,f8 // set deno & inexact flags
1284 (p8) br.cond.spnt tgamma_ovf_near_0 //tgamma_neg_overflow
1285 };;
1286 { .mib
1287 nop.m 0
1288 (p9) cmp.eq.unc p8,p0 = GR_Sig,GR_Sig2
1289 (p8) br.cond.spnt tgamma_ovf_near_0_boundary //tgamma_neg_overflow
1290 };;
1291 { .mfi
1292 nop.m 0
1293 fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
1294 nop.i 0
1295 };;
1296 // NR method: ineration #2
1297 { .mfi
1298 nop.m 0
1299 fnma.s1 FR_Rcp2 = FR_Rcp1,FR_GAMMA,f1 // t = 1 - r1*x
1300 nop.i 0
1301 };;
1302 { .mfi
1303 nop.m 0
1304 fma.s1 FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
1305 nop.i 0
1306 };;
1307 // NR method: ineration #3
1308 { .mfi
1309 nop.m 0
1310 fnma.s1 FR_Rcp3 = FR_Rcp2,FR_GAMMA,f1 // t = 1 - r2*x
1311 nop.i 0
1312 }
1313 { .mfi
1314 nop.m 0
1315 (p13) fma.s1 FR_Rcp2 = FR_Rcp2,FR_AbsXp1,f0
1316 (p14) cmp.ltu p10,p11 = 0x9,GR_Tbl_Ind
1317 };;
1318 .pred.rel "mutex",p10,p11
1319 { .mfi
1320 nop.m 0
1321 (p10) fma.s1 FR_GAMMA = FR_Rcp2,FR_Rcp3,FR_Rcp2
1322 nop.i 0
1323 }
1324 { .mfi
1325 nop.m 0
1326 (p11) fma.d.s0 f8 = FR_Rcp2,FR_Rcp3,FR_Rcp2
1327 nop.i 0
1328 };;
1329 { .mfb
1330 nop.m 0
1331 (p10) fma.d.s0 f8 = FR_GAMMA,FR_InvAn,f0
1332 br.ret.sptk b0
1333 };;
1334
1335
1336 // here if x >= 3
1337 //--------------------------------------------------------------------
1338 .align 32
1339 tgamma_positives:
1340 .pred.rel "mutex",p8,p9
1341 { .mfi
1342 nop.m 0
1343 (p9) fma.d.s0 f8 = FR_GAMMA,FR_Rq1,f0
1344 nop.i 0
1345 }
1346 { .mfb
1347 nop.m 0
1348 (p8) fma.d.s0 f8 = FR_GAMMA,FR_RqLin,f0
1349 br.ret.sptk b0
1350 };;
1351
1352 // here if 0 < x < 1
1353 //--------------------------------------------------------------------
1354 .align 32
1355 tgamma_from_0_to_2:
1356 { .mfi
1357 getf.exp GR_Sign_Exp = FR_r02
1358 fms.s1 FR_r = FR_r02,f1,FR_Xmin
1359 mov GR_ExpOf025 = 0xFFFD
1360 }
1361 { .mfi
1362 add GR_ad_Co = 0x1200,GR_ad_Data
1363 (p6) fnma.s1 FR_Rcp1 = FR_Rcp0,FR_NormX,f1 // t = 1 - r0*x
1364 (p6) mov GR_Sig2 = 1
1365 };;
1366 { .mfi
1367 (p6) getf.sig GR_Sig = FR_NormX
1368 nop.f 0
1369 (p6) shl GR_Sig2 = GR_Sig2,63
1370 }
1371 { .mfi
1372 add GR_ad_Ce = 0x1210,GR_ad_Data
1373 nop.f 0
1374 (p6) mov GR_NzOvfBound = 0xFBFF
1375 };;
1376 { .mfi
1377 cmp.eq p8,p0 = GR_Sign_Exp,GR_ExpOf05 // r02 >= 1/2
1378 nop.f 0
1379 cmp.eq p9,p10 = GR_Sign_Exp,GR_ExpOf025 // r02 >= 1/4
1380 }
1381 { .mfi
1382 (p6) cmp.ltu.unc p11,p0 = GR_Sign_Exp,GR_NzOvfBound // p11 <- overflow
1383 nop.f 0
1384 (p6) cmp.eq.unc p12,p0 = GR_Sign_Exp,GR_NzOvfBound
1385 };;
1386 .pred.rel "mutex",p8,p9
1387 { .mfi
1388 (p8) add GR_ad_Co = 0x200,GR_ad_Co
1389 (p6) fma.d.s0 FR_X = f1,f1,f8 // set deno & inexact flags
1390 (p9) add GR_ad_Co = 0x100,GR_ad_Co
1391 }
1392 { .mib
1393 (p8) add GR_ad_Ce = 0x200,GR_ad_Ce
1394 (p9) add GR_ad_Ce = 0x100,GR_ad_Ce
1395 (p11) br.cond.spnt tgamma_ovf_near_0 //tgamma_spec_res
1396 };;
1397 { .mfi
1398 ldfe FR_A15 = [GR_ad_Co],32
1399 nop.f 0
1400 (p12) cmp.eq.unc p13,p0 = GR_Sig,GR_Sig2
1401 }
1402 { .mfb
1403 ldfe FR_A14 = [GR_ad_Ce],32
1404 nop.f 0
1405 (p13) br.cond.spnt tgamma_ovf_near_0_boundary //tgamma_spec_res
1406 };;
1407 { .mfi
1408 ldfe FR_A13 = [GR_ad_Co],32
1409 nop.f 0
1410 nop.i 0
1411 }
1412 { .mfi
1413 ldfe FR_A12 = [GR_ad_Ce],32
1414 nop.f 0
1415 nop.i 0
1416 };;
1417 .pred.rel "mutex",p9,p10
1418 { .mfi
1419 ldfe FR_A11 = [GR_ad_Co],32
1420 (p10) fma.s1 FR_r2 = FR_r02,FR_r02,f0
1421 nop.i 0
1422 }
1423 { .mfi
1424 ldfe FR_A10 = [GR_ad_Ce],32
1425 (p9) fma.s1 FR_r2 = FR_r,FR_r,f0
1426 nop.i 0
1427 };;
1428 { .mfi
1429 ldfe FR_A9 = [GR_ad_Co],32
1430 (p6) fma.s1 FR_Rcp1 = FR_Rcp0,FR_Rcp1,FR_Rcp0
1431 nop.i 0
1432 }
1433 { .mfi
1434 ldfe FR_A8 = [GR_ad_Ce],32
1435 (p10) fma.s1 FR_r = f0,f0,FR_r02
1436 nop.i 0
1437 };;
1438 { .mfi
1439 ldfe FR_A7 = [GR_ad_Co],32
1440 nop.f 0
1441 nop.i 0
1442 }
1443 { .mfi
1444 ldfe FR_A6 = [GR_ad_Ce],32
1445 nop.f 0
1446 nop.i 0
1447 };;
1448 { .mfi
1449 ldfe FR_A5 = [GR_ad_Co],32
1450 nop.f 0
1451 nop.i 0
1452 }
1453 { .mfi
1454 ldfe FR_A4 = [GR_ad_Ce],32
1455 nop.f 0
1456 nop.i 0
1457 };;
1458 { .mfi
1459 ldfe FR_A3 = [GR_ad_Co],32
1460 nop.f 0
1461 nop.i 0
1462 }
1463 { .mfi
1464 ldfe FR_A2 = [GR_ad_Ce],32
1465 nop.f 0
1466 nop.i 0
1467 };;
1468 { .mfi
1469 ldfe FR_A1 = [GR_ad_Co],32
1470 fma.s1 FR_r4 = FR_r2,FR_r2,f0
1471 nop.i 0
1472 }
1473 { .mfi
1474 ldfe FR_A0 = [GR_ad_Ce],32
1475 nop.f 0
1476 nop.i 0
1477 };;
1478 { .mfi
1479 nop.m 0
1480 (p6) fnma.s1 FR_Rcp2 = FR_Rcp1,FR_NormX,f1 // t = 1 - r1*x
1481 nop.i 0
1482 };;
1483 { .mfi
1484 nop.m 0
1485 fma.s1 FR_A15 = FR_A15,FR_r,FR_A14
1486 nop.i 0
1487 }
1488 { .mfi
1489 nop.m 0
1490 fma.s1 FR_A11 = FR_A11,FR_r,FR_A10
1491 nop.i 0
1492 };;
1493 { .mfi
1494 nop.m 0
1495 fma.s1 FR_r8 = FR_r4,FR_r4,f0
1496 nop.i 0
1497 };;
1498 { .mfi
1499 nop.m 0
1500 (p6) fma.s1 FR_Rcp2 = FR_Rcp1,FR_Rcp2,FR_Rcp1
1501 nop.i 0
1502 };;
1503 { .mfi
1504 nop.m 0
1505 fma.s1 FR_A7 = FR_A7,FR_r,FR_A6
1506 nop.i 0
1507 }
1508 { .mfi
1509 nop.m 0
1510 fma.s1 FR_A3 = FR_A3,FR_r,FR_A2
1511 nop.i 0
1512 };;
1513 { .mfi
1514 nop.m 0
1515 fma.s1 FR_A15 = FR_A15,FR_r,FR_A13
1516 nop.i 0
1517 }
1518 { .mfi
1519 nop.m 0
1520 fma.s1 FR_A11 = FR_A11,FR_r,FR_A9
1521 nop.i 0
1522 };;
1523 { .mfi
1524 nop.m 0
1525 (p6) fnma.s1 FR_Rcp3 = FR_Rcp2,FR_NormX,f1 // t = 1 - r1*x
1526 nop.i 0
1527 };;
1528 { .mfi
1529 nop.m 0
1530 fma.s1 FR_A7 = FR_A7,FR_r,FR_A5
1531 nop.i 0
1532 }
1533 { .mfi
1534 nop.m 0
1535 fma.s1 FR_A3 = FR_A3,FR_r,FR_A1
1536 nop.i 0
1537 };;
1538 { .mfi
1539 nop.m 0
1540 fma.s1 FR_A15 = FR_A15,FR_r,FR_A12
1541 nop.i 0
1542 }
1543 { .mfi
1544 nop.m 0
1545 fma.s1 FR_A11 = FR_A11,FR_r,FR_A8
1546 nop.i 0
1547 };;
1548 { .mfi
1549 nop.m 0
1550 (p6) fma.s1 FR_Rcp3 = FR_Rcp2,FR_Rcp3,FR_Rcp2
1551 nop.i 0
1552 };;
1553 { .mfi
1554 nop.m 0
1555 fma.s1 FR_A7 = FR_A7,FR_r,FR_A4
1556 nop.i 0
1557 }
1558 { .mfi
1559 nop.m 0
1560 fma.s1 FR_A3 = FR_A3,FR_r,FR_A0
1561 nop.i 0
1562 };;
1563 { .mfi
1564 nop.m 0
1565 fma.s1 FR_A15 = FR_A15,FR_r4,FR_A11
1566 nop.i 0
1567 }
1568 { .mfi
1569 nop.m 0
1570 fma.s1 FR_A7 = FR_A7,FR_r4,FR_A3
1571 nop.i 0
1572 };;
1573 .pred.rel "mutex",p6,p7
1574 { .mfi
1575 nop.m 0
1576 (p6) fma.s1 FR_A15 = FR_A15,FR_r8,FR_A7
1577 nop.i 0
1578 }
1579 { .mfi
1580 nop.m 0
1581 (p7) fma.d.s0 f8 = FR_A15,FR_r8,FR_A7
1582 nop.i 0
1583 };;
1584 { .mfb
1585 nop.m 0
1586 (p6) fma.d.s0 f8 = FR_A15,FR_Rcp3,f0
1587 br.ret.sptk b0
1588 };;
1589
1590 // overflow
1591 //--------------------------------------------------------------------
1592 .align 32
1593 tgamma_ovf_near_0_boundary:
1594 .pred.rel "mutex",p14,p15
1595 { .mfi
1596 mov GR_fpsr = ar.fpsr
1597 nop.f 0
1598 (p15) mov r8 = 0x7ff
1599 }
1600 { .mfi
1601 nop.m 0
1602 nop.f 0
1603 (p14) mov r8 = 0xfff
1604 };;
1605 { .mfi
1606 nop.m 0
1607 nop.f 0
1608 shl r8 = r8,52
1609 };;
1610 { .mfi
1611 sub r8 = r8,r0,1
1612 nop.f 0
1613 extr.u GR_fpsr = GR_fpsr,10,2 // rounding mode
1614 };;
1615 .pred.rel "mutex",p14,p15
1616 { .mfi
1617 // set p8 to 0 in case of overflow and to 1 otherwise
1618 // for negative arg:
1619 // no overflow if rounding mode either Z or +Inf, i.e.
1620 // GR_fpsr > 1
1621 (p14) cmp.lt p8,p0 = 1,GR_fpsr
1622 nop.f 0
1623 // for positive arg:
1624 // no overflow if rounding mode either Z or -Inf, i.e.
1625 // (GR_fpsr & 1) == 0
1626 (p15) tbit.z p0,p8 = GR_fpsr,0
1627 };;
1628 { .mib
1629 (p8) setf.d f8 = r8 // set result to 0x7fefffffffffffff without
1630 // OVERFLOW flag raising
1631 nop.i 0
1632 (p8) br.ret.sptk b0
1633 };;
1634 .align 32
1635 tgamma_ovf_near_0:
1636 { .mfi
1637 mov r8 = 0x1FFFE
1638 nop.f 0
1639 nop.i 0
1640 };;
1641 { .mfi
1642 setf.exp f9 = r8
1643 fmerge.s FR_X = f8,f8
1644 mov GR_TAG = 258 // overflow
1645 };;
1646 .pred.rel "mutex",p14,p15
1647 { .mfi
1648 nop.m 0
1649 (p15) fma.d.s0 f8 = f9,f9,f0 // Set I,O and +INF result
1650 nop.i 0
1651 }
1652 { .mfb
1653 nop.m 0
1654 (p14) fnma.d.s0 f8 = f9,f9,f0 // Set I,O and -INF result
1655 br.cond.sptk tgamma_libm_err
1656 };;
1657 // overflow or absolute value of x is too big
1658 //--------------------------------------------------------------------
1659 .align 32
1660 tgamma_spec_res:
1661 { .mfi
1662 mov GR_0x30033 = 0x30033
1663 (p14) fcmp.eq.unc.s1 p10,p11 = f8,FR_Xt
1664 (p15) mov r8 = 0x1FFFE
1665 };;
1666 { .mfi
1667 (p15) setf.exp f9 = r8
1668 nop.f 0
1669 nop.i 0
1670 };;
1671 { .mfb
1672 (p11) cmp.ltu.unc p7,p8 = GR_0x30033,GR_Sign_Exp
1673 nop.f 0
1674 (p10) br.cond.spnt tgamma_singularity
1675 };;
1676 .pred.rel "mutex",p7,p8
1677 { .mbb
1678 nop.m 0
1679 (p7) br.cond.spnt tgamma_singularity
1680 (p8) br.cond.spnt tgamma_underflow
1681 };;
1682 { .mfi
1683 nop.m 0
1684 fmerge.s FR_X = f8,f8
1685 mov GR_TAG = 258 // overflow
1686 }
1687 { .mfb
1688 nop.m 0
1689 (p15) fma.d.s0 f8 = f9,f9,f0 // Set I,O and +INF result
1690 br.cond.sptk tgamma_libm_err
1691 };;
1692
1693 // x is negative integer or +/-0
1694 //--------------------------------------------------------------------
1695 .align 32
1696 tgamma_singularity:
1697 { .mfi
1698 nop.m 0
1699 fmerge.s FR_X = f8,f8
1700 mov GR_TAG = 259 // negative
1701 }
1702 { .mfb
1703 nop.m 0
1704 frcpa.s0 f8,p0 = f0,f0
1705 br.cond.sptk tgamma_libm_err
1706 };;
1707 // x is negative noninteger with big absolute value
1708 //--------------------------------------------------------------------
1709 .align 32
1710 tgamma_underflow:
1711 { .mmi
1712 getf.sig GR_Sig = FR_iXt
1713 mov r11 = 0x00001
1714 nop.i 0
1715 };;
1716 { .mfi
1717 setf.exp f9 = r11
1718 nop.f 0
1719 nop.i 0
1720 };;
1721 { .mfi
1722 nop.m 0
1723 nop.f 0
1724 tbit.z p6,p7 = GR_Sig,0
1725 };;
1726 .pred.rel "mutex",p6,p7
1727 { .mfi
1728 nop.m 0
1729 (p6) fms.d.s0 f8 = f9,f9,f9
1730 nop.i 0
1731 }
1732 { .mfb
1733 nop.m 0
1734 (p7) fma.d.s0 f8 = f9,f9,f9
1735 br.ret.sptk b0
1736 };;
1737
1738 // x for natval, nan, +/-inf or +/-0
1739 //--------------------------------------------------------------------
1740 .align 32
1741 tgamma_spec:
1742 { .mfi
1743 nop.m 0
1744 fclass.m p6,p0 = f8,0x1E1 // Test x for natval, nan, +inf
1745 nop.i 0
1746 };;
1747 { .mfi
1748 nop.m 0
1749 fclass.m p7,p8 = f8,0x7 // +/-0
1750 nop.i 0
1751 };;
1752 { .mfi
1753 nop.m 0
1754 fmerge.s FR_X = f8,f8
1755 nop.i 0
1756 }
1757 { .mfb
1758 nop.m 0
1759 (p6) fma.d.s0 f8 = f8,f1,f8
1760 (p6) br.ret.spnt b0
1761 };;
1762 .pred.rel "mutex",p7,p8
1763 { .mfi
1764 (p7) mov GR_TAG = 259 // negative
1765 (p7) frcpa.s0 f8,p0 = f1,f8
1766 nop.i 0
1767 }
1768 { .mib
1769 nop.m 0
1770 nop.i 0
1771 (p8) br.cond.spnt tgamma_singularity
1772 };;
1773
1774 .align 32
1775 tgamma_libm_err:
1776 { .mfi
1777 alloc r32 = ar.pfs,1,4,4,0
1778 nop.f 0
1779 mov GR_Parameter_TAG = GR_TAG
1780 };;
1781
1782 GLOBAL_LIBM_END(tgamma)
1783 libm_alias_double_other (tgamma, tgamma)
1784
1785 LOCAL_LIBM_ENTRY(__libm_error_region)
1786 .prologue
1787 { .mfi
1788 add GR_Parameter_Y=-32,sp // Parameter 2 value
1789 nop.f 0
1790 .save ar.pfs,GR_SAVE_PFS
1791 mov GR_SAVE_PFS=ar.pfs // Save ar.pfs
1792 }
1793 { .mfi
1794 .fframe 64
1795 add sp=-64,sp // Create new stack
1796 nop.f 0
1797 mov GR_SAVE_GP=gp // Save gp
1798 };;
1799 { .mmi
1800 stfd [GR_Parameter_Y] = FR_Y,16 // STORE Parameter 2 on stack
1801 add GR_Parameter_X = 16,sp // Parameter 1 address
1802 .save b0, GR_SAVE_B0
1803 mov GR_SAVE_B0=b0 // Save b0
1804 };;
1805 .body
1806 { .mib
1807 stfd [GR_Parameter_X] = FR_X // STORE Parameter 1 on stack
1808 add GR_Parameter_RESULT = 0,GR_Parameter_Y // Parameter 3 address
1809 nop.b 0
1810 }
1811 { .mib
1812 stfd [GR_Parameter_Y] = FR_RESULT // STORE Parameter 3 on stack
1813 add GR_Parameter_Y = -16,GR_Parameter_Y
1814 br.call.sptk b0=__libm_error_support# // Call error handling function
1815 };;
1816 { .mmi
1817 nop.m 0
1818 nop.m 0
1819 add GR_Parameter_RESULT = 48,sp
1820 };;
1821 { .mmi
1822 ldfd f8 = [GR_Parameter_RESULT] // Get return result off stack
1823 .restore sp
1824 add sp = 64,sp // Restore stack pointer
1825 mov b0 = GR_SAVE_B0 // Restore return address
1826 };;
1827 { .mib
1828 mov gp = GR_SAVE_GP // Restore gp
1829 mov ar.pfs = GR_SAVE_PFS // Restore ar.pfs
1830 br.ret.sptk b0 // Return
1831 };;
1832
1833 LOCAL_LIBM_END(__libm_error_region)
1834 .type __libm_error_support#,@function
1835 .global __libm_error_support#