[sim] added SoftFloat-3 source
[riscv-isa-sim.git] / softfloat / SoftFloat-3 / source / OLD-softfloat.c
1
2 /*============================================================================
3
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
6
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
16
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
30
31 =============================================================================*/
32
33 #include "milieu.h"
34 #include "softfloat.h"
35
36 /*----------------------------------------------------------------------------
37 | Primitive arithmetic functions, including multi-word arithmetic, and
38 | division and square root approximations. (Can be specialized to target if
39 | desired.)
40 *----------------------------------------------------------------------------*/
41 #include "softfloat-macros"
42
43 /*----------------------------------------------------------------------------
44 | Functions and definitions to determine: (1) whether tininess for underflow
45 | is detected before or after rounding by default, (2) what (if anything)
46 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
47 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
48 | are propagated from function inputs to output. These details are target-
49 | specific.
50 *----------------------------------------------------------------------------*/
51 #include "softfloat-specialize"
52
53 #ifdef FLOATX80
54
55 /*----------------------------------------------------------------------------
56 | Returns the fraction bits of the extended double-precision floating-point
57 | value `a'.
58 *----------------------------------------------------------------------------*/
59
60 INLINE bits64 extractFloatx80Frac( floatx80 a )
61 {
62
63 return a.low;
64
65 }
66
67 /*----------------------------------------------------------------------------
68 | Returns the exponent bits of the extended double-precision floating-point
69 | value `a'.
70 *----------------------------------------------------------------------------*/
71
72 INLINE int32 extractFloatx80Exp( floatx80 a )
73 {
74
75 return a.high & 0x7FFF;
76
77 }
78
79 /*----------------------------------------------------------------------------
80 | Returns the sign bit of the extended double-precision floating-point value
81 | `a'.
82 *----------------------------------------------------------------------------*/
83
84 INLINE flag extractFloatx80Sign( floatx80 a )
85 {
86
87 return a.high>>15;
88
89 }
90
91 /*----------------------------------------------------------------------------
92 | Normalizes the subnormal extended double-precision floating-point value
93 | represented by the denormalized significand `aSig'. The normalized exponent
94 | and significand are stored at the locations pointed to by `zExpPtr' and
95 | `zSigPtr', respectively.
96 *----------------------------------------------------------------------------*/
97
98 static void
99 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
100 {
101 int8 shiftCount;
102
103 shiftCount = countLeadingZeros64( aSig );
104 *zSigPtr = aSig<<shiftCount;
105 *zExpPtr = 1 - shiftCount;
106
107 }
108
109 /*----------------------------------------------------------------------------
110 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
111 | extended double-precision floating-point value, returning the result.
112 *----------------------------------------------------------------------------*/
113
114 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
115 {
116 floatx80 z;
117
118 z.low = zSig;
119 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
120 return z;
121
122 }
123
124 /*----------------------------------------------------------------------------
125 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
126 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
127 | and returns the proper extended double-precision floating-point value
128 | corresponding to the abstract input. Ordinarily, the abstract value is
129 | rounded and packed into the extended double-precision format, with the
130 | inexact exception raised if the abstract input cannot be represented
131 | exactly. However, if the abstract value is too large, the overflow and
132 | inexact exceptions are raised and an infinity or maximal finite value is
133 | returned. If the abstract value is too small, the input value is rounded to
134 | a subnormal number, and the underflow and inexact exceptions are raised if
135 | the abstract input cannot be represented exactly as a subnormal extended
136 | double-precision floating-point number.
137 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
138 | number of bits as single or double precision, respectively. Otherwise, the
139 | result is rounded to the full precision of the extended double-precision
140 | format.
141 | The input significand must be normalized or smaller. If the input
142 | significand is not normalized, `zExp' must be 0; in that case, the result
143 | returned is a subnormal number, and it must not require rounding. The
144 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
145 | Floating-Point Arithmetic.
146 *----------------------------------------------------------------------------*/
147
148 static floatx80
149 roundAndPackFloatx80(
150 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
151 )
152 {
153 int8 roundingMode;
154 flag roundNearestEven, increment, isTiny;
155 int64 roundIncrement, roundMask, roundBits;
156
157 roundingMode = float_rounding_mode;
158 roundNearestEven = ( roundingMode == float_round_nearest_even );
159 if ( roundingPrecision == 80 ) goto precision80;
160 if ( roundingPrecision == 64 ) {
161 roundIncrement = LIT64( 0x0000000000000400 );
162 roundMask = LIT64( 0x00000000000007FF );
163 }
164 else if ( roundingPrecision == 32 ) {
165 roundIncrement = LIT64( 0x0000008000000000 );
166 roundMask = LIT64( 0x000000FFFFFFFFFF );
167 }
168 else {
169 goto precision80;
170 }
171 zSig0 |= ( zSig1 != 0 );
172 if ( ! roundNearestEven ) {
173 if ( roundingMode == float_round_to_zero ) {
174 roundIncrement = 0;
175 }
176 else {
177 roundIncrement = roundMask;
178 if ( zSign ) {
179 if ( roundingMode == float_round_up ) roundIncrement = 0;
180 }
181 else {
182 if ( roundingMode == float_round_down ) roundIncrement = 0;
183 }
184 }
185 }
186 roundBits = zSig0 & roundMask;
187 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
188 if ( ( 0x7FFE < zExp )
189 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
190 ) {
191 goto overflow;
192 }
193 if ( zExp <= 0 ) {
194 isTiny =
195 ( float_detect_tininess == float_tininess_before_rounding )
196 || ( zExp < 0 )
197 || ( zSig0 <= zSig0 + roundIncrement );
198 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
199 zExp = 0;
200 roundBits = zSig0 & roundMask;
201 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
202 if ( roundBits ) float_exception_flags |= float_flag_inexact;
203 zSig0 += roundIncrement;
204 if ( (sbits64) zSig0 < 0 ) zExp = 1;
205 roundIncrement = roundMask + 1;
206 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
207 roundMask |= roundIncrement;
208 }
209 zSig0 &= ~ roundMask;
210 return packFloatx80( zSign, zExp, zSig0 );
211 }
212 }
213 if ( roundBits ) float_exception_flags |= float_flag_inexact;
214 zSig0 += roundIncrement;
215 if ( zSig0 < roundIncrement ) {
216 ++zExp;
217 zSig0 = LIT64( 0x8000000000000000 );
218 }
219 roundIncrement = roundMask + 1;
220 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
221 roundMask |= roundIncrement;
222 }
223 zSig0 &= ~ roundMask;
224 if ( zSig0 == 0 ) zExp = 0;
225 return packFloatx80( zSign, zExp, zSig0 );
226 precision80:
227 increment = ( (sbits64) zSig1 < 0 );
228 if ( ! roundNearestEven ) {
229 if ( roundingMode == float_round_to_zero ) {
230 increment = 0;
231 }
232 else {
233 if ( zSign ) {
234 increment = ( roundingMode == float_round_down ) && zSig1;
235 }
236 else {
237 increment = ( roundingMode == float_round_up ) && zSig1;
238 }
239 }
240 }
241 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
242 if ( ( 0x7FFE < zExp )
243 || ( ( zExp == 0x7FFE )
244 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
245 && increment
246 )
247 ) {
248 roundMask = 0;
249 overflow:
250 float_raise( float_flag_overflow | float_flag_inexact );
251 if ( ( roundingMode == float_round_to_zero )
252 || ( zSign && ( roundingMode == float_round_up ) )
253 || ( ! zSign && ( roundingMode == float_round_down ) )
254 ) {
255 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
256 }
257 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
258 }
259 if ( zExp <= 0 ) {
260 isTiny =
261 ( float_detect_tininess == float_tininess_before_rounding )
262 || ( zExp < 0 )
263 || ! increment
264 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
265 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
266 zExp = 0;
267 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
268 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
269 if ( roundNearestEven ) {
270 increment = ( (sbits64) zSig1 < 0 );
271 }
272 else {
273 if ( zSign ) {
274 increment = ( roundingMode == float_round_down ) && zSig1;
275 }
276 else {
277 increment = ( roundingMode == float_round_up ) && zSig1;
278 }
279 }
280 if ( increment ) {
281 ++zSig0;
282 zSig0 &=
283 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
284 if ( (sbits64) zSig0 < 0 ) zExp = 1;
285 }
286 return packFloatx80( zSign, zExp, zSig0 );
287 }
288 }
289 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
290 if ( increment ) {
291 ++zSig0;
292 if ( zSig0 == 0 ) {
293 ++zExp;
294 zSig0 = LIT64( 0x8000000000000000 );
295 }
296 else {
297 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
298 }
299 }
300 else {
301 if ( zSig0 == 0 ) zExp = 0;
302 }
303 return packFloatx80( zSign, zExp, zSig0 );
304
305 }
306
307 /*----------------------------------------------------------------------------
308 | Takes an abstract floating-point value having sign `zSign', exponent
309 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
310 | and returns the proper extended double-precision floating-point value
311 | corresponding to the abstract input. This routine is just like
312 | `roundAndPackFloatx80' except that the input significand does not have to be
313 | normalized.
314 *----------------------------------------------------------------------------*/
315
316 static floatx80
317 normalizeRoundAndPackFloatx80(
318 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
319 )
320 {
321 int8 shiftCount;
322
323 if ( zSig0 == 0 ) {
324 zSig0 = zSig1;
325 zSig1 = 0;
326 zExp -= 64;
327 }
328 shiftCount = countLeadingZeros64( zSig0 );
329 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
330 zExp -= shiftCount;
331 return
332 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
333
334 }
335
336 #endif
337
338 #ifdef FLOAT128
339
340 /*----------------------------------------------------------------------------
341 | Returns the least-significant 64 fraction bits of the quadruple-precision
342 | floating-point value `a'.
343 *----------------------------------------------------------------------------*/
344
345 INLINE bits64 extractFloat128Frac1( float128 a )
346 {
347
348 return a.low;
349
350 }
351
352 /*----------------------------------------------------------------------------
353 | Returns the most-significant 48 fraction bits of the quadruple-precision
354 | floating-point value `a'.
355 *----------------------------------------------------------------------------*/
356
357 INLINE bits64 extractFloat128Frac0( float128 a )
358 {
359
360 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
361
362 }
363
364 /*----------------------------------------------------------------------------
365 | Returns the exponent bits of the quadruple-precision floating-point value
366 | `a'.
367 *----------------------------------------------------------------------------*/
368
369 INLINE int32 extractFloat128Exp( float128 a )
370 {
371
372 return ( a.high>>48 ) & 0x7FFF;
373
374 }
375
376 /*----------------------------------------------------------------------------
377 | Returns the sign bit of the quadruple-precision floating-point value `a'.
378 *----------------------------------------------------------------------------*/
379
380 INLINE flag extractFloat128Sign( float128 a )
381 {
382
383 return a.high>>63;
384
385 }
386
387 /*----------------------------------------------------------------------------
388 | Normalizes the subnormal quadruple-precision floating-point value
389 | represented by the denormalized significand formed by the concatenation of
390 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
391 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
392 | significand are stored at the location pointed to by `zSig0Ptr', and the
393 | least significant 64 bits of the normalized significand are stored at the
394 | location pointed to by `zSig1Ptr'.
395 *----------------------------------------------------------------------------*/
396
397 static void
398 normalizeFloat128Subnormal(
399 bits64 aSig0,
400 bits64 aSig1,
401 int32 *zExpPtr,
402 bits64 *zSig0Ptr,
403 bits64 *zSig1Ptr
404 )
405 {
406 int8 shiftCount;
407
408 if ( aSig0 == 0 ) {
409 shiftCount = countLeadingZeros64( aSig1 ) - 15;
410 if ( shiftCount < 0 ) {
411 *zSig0Ptr = aSig1>>( - shiftCount );
412 *zSig1Ptr = aSig1<<( shiftCount & 63 );
413 }
414 else {
415 *zSig0Ptr = aSig1<<shiftCount;
416 *zSig1Ptr = 0;
417 }
418 *zExpPtr = - shiftCount - 63;
419 }
420 else {
421 shiftCount = countLeadingZeros64( aSig0 ) - 15;
422 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
423 *zExpPtr = 1 - shiftCount;
424 }
425
426 }
427
428 /*----------------------------------------------------------------------------
429 | Packs the sign `zSign', the exponent `zExp', and the significand formed
430 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
431 | floating-point value, returning the result. After being shifted into the
432 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
433 | added together to form the most significant 32 bits of the result. This
434 | means that any integer portion of `zSig0' will be added into the exponent.
435 | Since a properly normalized significand will have an integer portion equal
436 | to 1, the `zExp' input should be 1 less than the desired result exponent
437 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
438 | significand.
439 *----------------------------------------------------------------------------*/
440
441 INLINE float128
442 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
443 {
444 float128 z;
445
446 z.low = zSig1;
447 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
448 return z;
449
450 }
451
452 /*----------------------------------------------------------------------------
453 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
454 | and extended significand formed by the concatenation of `zSig0', `zSig1',
455 | and `zSig2', and returns the proper quadruple-precision floating-point value
456 | corresponding to the abstract input. Ordinarily, the abstract value is
457 | simply rounded and packed into the quadruple-precision format, with the
458 | inexact exception raised if the abstract input cannot be represented
459 | exactly. However, if the abstract value is too large, the overflow and
460 | inexact exceptions are raised and an infinity or maximal finite value is
461 | returned. If the abstract value is too small, the input value is rounded to
462 | a subnormal number, and the underflow and inexact exceptions are raised if
463 | the abstract input cannot be represented exactly as a subnormal quadruple-
464 | precision floating-point number.
465 | The input significand must be normalized or smaller. If the input
466 | significand is not normalized, `zExp' must be 0; in that case, the result
467 | returned is a subnormal number, and it must not require rounding. In the
468 | usual case that the input significand is normalized, `zExp' must be 1 less
469 | than the ``true'' floating-point exponent. The handling of underflow and
470 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
471 *----------------------------------------------------------------------------*/
472
473 static float128
474 roundAndPackFloat128(
475 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
476 {
477 int8 roundingMode;
478 flag roundNearestEven, increment, isTiny;
479
480 roundingMode = float_rounding_mode;
481 roundNearestEven = ( roundingMode == float_round_nearest_even );
482 increment = ( (sbits64) zSig2 < 0 );
483 if ( ! roundNearestEven ) {
484 if ( roundingMode == float_round_to_zero ) {
485 increment = 0;
486 }
487 else {
488 if ( zSign ) {
489 increment = ( roundingMode == float_round_down ) && zSig2;
490 }
491 else {
492 increment = ( roundingMode == float_round_up ) && zSig2;
493 }
494 }
495 }
496 if ( 0x7FFD <= (bits32) zExp ) {
497 if ( ( 0x7FFD < zExp )
498 || ( ( zExp == 0x7FFD )
499 && eq128(
500 LIT64( 0x0001FFFFFFFFFFFF ),
501 LIT64( 0xFFFFFFFFFFFFFFFF ),
502 zSig0,
503 zSig1
504 )
505 && increment
506 )
507 ) {
508 float_raise( float_flag_overflow | float_flag_inexact );
509 if ( ( roundingMode == float_round_to_zero )
510 || ( zSign && ( roundingMode == float_round_up ) )
511 || ( ! zSign && ( roundingMode == float_round_down ) )
512 ) {
513 return
514 packFloat128(
515 zSign,
516 0x7FFE,
517 LIT64( 0x0000FFFFFFFFFFFF ),
518 LIT64( 0xFFFFFFFFFFFFFFFF )
519 );
520 }
521 return packFloat128( zSign, 0x7FFF, 0, 0 );
522 }
523 if ( zExp < 0 ) {
524 isTiny =
525 ( float_detect_tininess == float_tininess_before_rounding )
526 || ( zExp < -1 )
527 || ! increment
528 || lt128(
529 zSig0,
530 zSig1,
531 LIT64( 0x0001FFFFFFFFFFFF ),
532 LIT64( 0xFFFFFFFFFFFFFFFF )
533 );
534 shift128ExtraRightJamming(
535 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
536 zExp = 0;
537 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
538 if ( roundNearestEven ) {
539 increment = ( (sbits64) zSig2 < 0 );
540 }
541 else {
542 if ( zSign ) {
543 increment = ( roundingMode == float_round_down ) && zSig2;
544 }
545 else {
546 increment = ( roundingMode == float_round_up ) && zSig2;
547 }
548 }
549 }
550 }
551 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
552 if ( increment ) {
553 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
554 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
555 }
556 else {
557 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
558 }
559 return packFloat128( zSign, zExp, zSig0, zSig1 );
560
561 }
562
563 /*----------------------------------------------------------------------------
564 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
565 | and significand formed by the concatenation of `zSig0' and `zSig1', and
566 | returns the proper quadruple-precision floating-point value corresponding
567 | to the abstract input. This routine is just like `roundAndPackFloat128'
568 | except that the input significand has fewer bits and does not have to be
569 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
570 | point exponent.
571 *----------------------------------------------------------------------------*/
572
573 static float128
574 normalizeRoundAndPackFloat128(
575 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
576 {
577 int8 shiftCount;
578 bits64 zSig2;
579
580 if ( zSig0 == 0 ) {
581 zSig0 = zSig1;
582 zSig1 = 0;
583 zExp -= 64;
584 }
585 shiftCount = countLeadingZeros64( zSig0 ) - 15;
586 if ( 0 <= shiftCount ) {
587 zSig2 = 0;
588 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
589 }
590 else {
591 shift128ExtraRightJamming(
592 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
593 }
594 zExp -= shiftCount;
595 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
596
597 }
598
599 #endif
600
601 #ifdef FLOATX80
602
603 /*----------------------------------------------------------------------------
604 | Returns the result of converting the 32-bit two's complement integer `a'
605 | to the extended double-precision floating-point format. The conversion
606 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
607 | Arithmetic.
608 *----------------------------------------------------------------------------*/
609
610 floatx80 int32_to_floatx80( int32 a )
611 {
612 flag zSign;
613 uint32 absA;
614 int8 shiftCount;
615 bits64 zSig;
616
617 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
618 zSign = ( a < 0 );
619 absA = zSign ? - a : a;
620 shiftCount = countLeadingZeros32( absA ) + 32;
621 zSig = absA;
622 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
623
624 }
625
626 #endif
627
628 #ifdef FLOAT128
629
630 /*----------------------------------------------------------------------------
631 | Returns the result of converting the 32-bit two's complement integer `a' to
632 | the quadruple-precision floating-point format. The conversion is performed
633 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
634 *----------------------------------------------------------------------------*/
635
636 float128 int32_to_float128( int32 a )
637 {
638 flag zSign;
639 uint32 absA;
640 int8 shiftCount;
641 bits64 zSig0;
642
643 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
644 zSign = ( a < 0 );
645 absA = zSign ? - a : a;
646 shiftCount = countLeadingZeros32( absA ) + 17;
647 zSig0 = absA;
648 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
649
650 }
651
652 #endif
653
654 #ifdef FLOATX80
655
656 /*----------------------------------------------------------------------------
657 | Returns the result of converting the 64-bit two's complement integer `a'
658 | to the extended double-precision floating-point format. The conversion
659 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
660 | Arithmetic.
661 *----------------------------------------------------------------------------*/
662
663 floatx80 int64_to_floatx80( int64 a )
664 {
665 flag zSign;
666 uint64 absA;
667 int8 shiftCount;
668
669 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
670 zSign = ( a < 0 );
671 absA = zSign ? - a : a;
672 shiftCount = countLeadingZeros64( absA );
673 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
674
675 }
676
677 #endif
678
679 #ifdef FLOAT128
680
681 /*----------------------------------------------------------------------------
682 | Returns the result of converting the 64-bit two's complement integer `a' to
683 | the quadruple-precision floating-point format. The conversion is performed
684 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
685 *----------------------------------------------------------------------------*/
686
687 float128 int64_to_float128( int64 a )
688 {
689 flag zSign;
690 uint64 absA;
691 int8 shiftCount;
692 int32 zExp;
693 bits64 zSig0, zSig1;
694
695 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
696 zSign = ( a < 0 );
697 absA = zSign ? - a : a;
698 shiftCount = countLeadingZeros64( absA ) + 49;
699 zExp = 0x406E - shiftCount;
700 if ( 64 <= shiftCount ) {
701 zSig1 = 0;
702 zSig0 = absA;
703 shiftCount -= 64;
704 }
705 else {
706 zSig1 = absA;
707 zSig0 = 0;
708 }
709 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
710 return packFloat128( zSign, zExp, zSig0, zSig1 );
711
712 }
713
714 #endif
715
716 #ifdef FLOATX80
717
718 /*----------------------------------------------------------------------------
719 | Returns the result of converting the single-precision floating-point value
720 | `a' to the extended double-precision floating-point format. The conversion
721 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
722 | Arithmetic.
723 *----------------------------------------------------------------------------*/
724
725 floatx80 float32_to_floatx80( float32 a )
726 {
727 flag aSign;
728 int16 aExp;
729 bits32 aSig;
730
731 aSig = extractFloat32Frac( a );
732 aExp = extractFloat32Exp( a );
733 aSign = extractFloat32Sign( a );
734 if ( aExp == 0xFF ) {
735 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
736 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
737 }
738 if ( aExp == 0 ) {
739 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
740 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
741 }
742 aSig |= 0x00800000;
743 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
744
745 }
746
747 #endif
748
749 #ifdef FLOAT128
750
751 /*----------------------------------------------------------------------------
752 | Returns the result of converting the single-precision floating-point value
753 | `a' to the double-precision floating-point format. The conversion is
754 | performed according to the IEC/IEEE Standard for Binary Floating-Point
755 | Arithmetic.
756 *----------------------------------------------------------------------------*/
757
758 float128 float32_to_float128( float32 a )
759 {
760 flag aSign;
761 int16 aExp;
762 bits32 aSig;
763
764 aSig = extractFloat32Frac( a );
765 aExp = extractFloat32Exp( a );
766 aSign = extractFloat32Sign( a );
767 if ( aExp == 0xFF ) {
768 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
769 return packFloat128( aSign, 0x7FFF, 0, 0 );
770 }
771 if ( aExp == 0 ) {
772 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
773 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
774 --aExp;
775 }
776 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
777
778 }
779
780 #endif
781
782 #ifdef FLOATX80
783
784 /*----------------------------------------------------------------------------
785 | Returns the result of converting the double-precision floating-point value
786 | `a' to the extended double-precision floating-point format. The conversion
787 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
788 | Arithmetic.
789 *----------------------------------------------------------------------------*/
790
791 floatx80 float64_to_floatx80( float64 a )
792 {
793 flag aSign;
794 int16 aExp;
795 bits64 aSig;
796
797 aSig = extractFloat64Frac( a );
798 aExp = extractFloat64Exp( a );
799 aSign = extractFloat64Sign( a );
800 if ( aExp == 0x7FF ) {
801 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
802 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
803 }
804 if ( aExp == 0 ) {
805 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
806 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
807 }
808 return
809 packFloatx80(
810 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
811
812 }
813
814 #endif
815
816 #ifdef FLOAT128
817
818 /*----------------------------------------------------------------------------
819 | Returns the result of converting the double-precision floating-point value
820 | `a' to the quadruple-precision floating-point format. The conversion is
821 | performed according to the IEC/IEEE Standard for Binary Floating-Point
822 | Arithmetic.
823 *----------------------------------------------------------------------------*/
824
825 float128 float64_to_float128( float64 a )
826 {
827 flag aSign;
828 int16 aExp;
829 bits64 aSig, zSig0, zSig1;
830
831 aSig = extractFloat64Frac( a );
832 aExp = extractFloat64Exp( a );
833 aSign = extractFloat64Sign( a );
834 if ( aExp == 0x7FF ) {
835 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
836 return packFloat128( aSign, 0x7FFF, 0, 0 );
837 }
838 if ( aExp == 0 ) {
839 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
840 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
841 --aExp;
842 }
843 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
844 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
845
846 }
847
848 #endif
849
850 #ifdef FLOATX80
851
852 /*----------------------------------------------------------------------------
853 | Returns the result of converting the extended double-precision floating-
854 | point value `a' to the 32-bit two's complement integer format. The
855 | conversion is performed according to the IEC/IEEE Standard for Binary
856 | Floating-Point Arithmetic---which means in particular that the conversion
857 | is rounded according to the current rounding mode. If `a' is a NaN, the
858 | largest positive integer is returned. Otherwise, if the conversion
859 | overflows, the largest integer with the same sign as `a' is returned.
860 *----------------------------------------------------------------------------*/
861
862 int32 floatx80_to_int32( floatx80 a )
863 {
864 flag aSign;
865 int32 aExp, shiftCount;
866 bits64 aSig;
867
868 aSig = extractFloatx80Frac( a );
869 aExp = extractFloatx80Exp( a );
870 aSign = extractFloatx80Sign( a );
871 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
872 shiftCount = 0x4037 - aExp;
873 if ( shiftCount <= 0 ) shiftCount = 1;
874 shift64RightJamming( aSig, shiftCount, &aSig );
875 return roundAndPackInt32( aSign, aSig );
876
877 }
878
879 /*----------------------------------------------------------------------------
880 | Returns the result of converting the extended double-precision floating-
881 | point value `a' to the 32-bit two's complement integer format. The
882 | conversion is performed according to the IEC/IEEE Standard for Binary
883 | Floating-Point Arithmetic, except that the conversion is always rounded
884 | toward zero. If `a' is a NaN, the largest positive integer is returned.
885 | Otherwise, if the conversion overflows, the largest integer with the same
886 | sign as `a' is returned.
887 *----------------------------------------------------------------------------*/
888
889 int32 floatx80_to_int32_round_to_zero( floatx80 a )
890 {
891 flag aSign;
892 int32 aExp, shiftCount;
893 bits64 aSig, savedASig;
894 int32 z;
895
896 aSig = extractFloatx80Frac( a );
897 aExp = extractFloatx80Exp( a );
898 aSign = extractFloatx80Sign( a );
899 if ( 0x401E < aExp ) {
900 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
901 goto invalid;
902 }
903 else if ( aExp < 0x3FFF ) {
904 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
905 return 0;
906 }
907 shiftCount = 0x403E - aExp;
908 savedASig = aSig;
909 aSig >>= shiftCount;
910 z = aSig;
911 if ( aSign ) z = - z;
912 if ( ( z < 0 ) ^ aSign ) {
913 invalid:
914 float_raise( float_flag_invalid );
915 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
916 }
917 if ( ( aSig<<shiftCount ) != savedASig ) {
918 float_exception_flags |= float_flag_inexact;
919 }
920 return z;
921
922 }
923
924 /*----------------------------------------------------------------------------
925 | Returns the result of converting the extended double-precision floating-
926 | point value `a' to the 64-bit two's complement integer format. The
927 | conversion is performed according to the IEC/IEEE Standard for Binary
928 | Floating-Point Arithmetic---which means in particular that the conversion
929 | is rounded according to the current rounding mode. If `a' is a NaN,
930 | the largest positive integer is returned. Otherwise, if the conversion
931 | overflows, the largest integer with the same sign as `a' is returned.
932 *----------------------------------------------------------------------------*/
933
934 int64 floatx80_to_int64( floatx80 a )
935 {
936 flag aSign;
937 int32 aExp, shiftCount;
938 bits64 aSig, aSigExtra;
939
940 aSig = extractFloatx80Frac( a );
941 aExp = extractFloatx80Exp( a );
942 aSign = extractFloatx80Sign( a );
943 shiftCount = 0x403E - aExp;
944 if ( shiftCount <= 0 ) {
945 if ( shiftCount ) {
946 float_raise( float_flag_invalid );
947 if ( ! aSign
948 || ( ( aExp == 0x7FFF )
949 && ( aSig != LIT64( 0x8000000000000000 ) ) )
950 ) {
951 return LIT64( 0x7FFFFFFFFFFFFFFF );
952 }
953 return (sbits64) LIT64( 0x8000000000000000 );
954 }
955 aSigExtra = 0;
956 }
957 else {
958 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
959 }
960 return roundAndPackInt64( aSign, aSig, aSigExtra );
961
962 }
963
964 /*----------------------------------------------------------------------------
965 | Returns the result of converting the extended double-precision floating-
966 | point value `a' to the 64-bit two's complement integer format. The
967 | conversion is performed according to the IEC/IEEE Standard for Binary
968 | Floating-Point Arithmetic, except that the conversion is always rounded
969 | toward zero. If `a' is a NaN, the largest positive integer is returned.
970 | Otherwise, if the conversion overflows, the largest integer with the same
971 | sign as `a' is returned.
972 *----------------------------------------------------------------------------*/
973
974 int64 floatx80_to_int64_round_to_zero( floatx80 a )
975 {
976 flag aSign;
977 int32 aExp, shiftCount;
978 bits64 aSig;
979 int64 z;
980
981 aSig = extractFloatx80Frac( a );
982 aExp = extractFloatx80Exp( a );
983 aSign = extractFloatx80Sign( a );
984 shiftCount = aExp - 0x403E;
985 if ( 0 <= shiftCount ) {
986 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
987 if ( ( a.high != 0xC03E ) || aSig ) {
988 float_raise( float_flag_invalid );
989 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
990 return LIT64( 0x7FFFFFFFFFFFFFFF );
991 }
992 }
993 return (sbits64) LIT64( 0x8000000000000000 );
994 }
995 else if ( aExp < 0x3FFF ) {
996 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
997 return 0;
998 }
999 z = aSig>>( - shiftCount );
1000 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
1001 float_exception_flags |= float_flag_inexact;
1002 }
1003 if ( aSign ) z = - z;
1004 return z;
1005
1006 }
1007
1008 /*----------------------------------------------------------------------------
1009 | Returns the result of converting the extended double-precision floating-
1010 | point value `a' to the single-precision floating-point format. The
1011 | conversion is performed according to the IEC/IEEE Standard for Binary
1012 | Floating-Point Arithmetic.
1013 *----------------------------------------------------------------------------*/
1014
1015 float32 floatx80_to_float32( floatx80 a )
1016 {
1017 flag aSign;
1018 int32 aExp;
1019 bits64 aSig;
1020
1021 aSig = extractFloatx80Frac( a );
1022 aExp = extractFloatx80Exp( a );
1023 aSign = extractFloatx80Sign( a );
1024 if ( aExp == 0x7FFF ) {
1025 if ( (bits64) ( aSig<<1 ) ) {
1026 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
1027 }
1028 return packFloat32( aSign, 0xFF, 0 );
1029 }
1030 shift64RightJamming( aSig, 33, &aSig );
1031 if ( aExp || aSig ) aExp -= 0x3F81;
1032 return roundAndPackFloat32( aSign, aExp, aSig );
1033
1034 }
1035
1036 /*----------------------------------------------------------------------------
1037 | Returns the result of converting the extended double-precision floating-
1038 | point value `a' to the double-precision floating-point format. The
1039 | conversion is performed according to the IEC/IEEE Standard for Binary
1040 | Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1042
1043 float64 floatx80_to_float64( floatx80 a )
1044 {
1045 flag aSign;
1046 int32 aExp;
1047 bits64 aSig, zSig;
1048
1049 aSig = extractFloatx80Frac( a );
1050 aExp = extractFloatx80Exp( a );
1051 aSign = extractFloatx80Sign( a );
1052 if ( aExp == 0x7FFF ) {
1053 if ( (bits64) ( aSig<<1 ) ) {
1054 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
1055 }
1056 return packFloat64( aSign, 0x7FF, 0 );
1057 }
1058 shift64RightJamming( aSig, 1, &zSig );
1059 if ( aExp || aSig ) aExp -= 0x3C01;
1060 return roundAndPackFloat64( aSign, aExp, zSig );
1061
1062 }
1063
1064 #ifdef FLOAT128
1065
1066 /*----------------------------------------------------------------------------
1067 | Returns the result of converting the extended double-precision floating-
1068 | point value `a' to the quadruple-precision floating-point format. The
1069 | conversion is performed according to the IEC/IEEE Standard for Binary
1070 | Floating-Point Arithmetic.
1071 *----------------------------------------------------------------------------*/
1072
1073 float128 floatx80_to_float128( floatx80 a )
1074 {
1075 flag aSign;
1076 int16 aExp;
1077 bits64 aSig, zSig0, zSig1;
1078
1079 aSig = extractFloatx80Frac( a );
1080 aExp = extractFloatx80Exp( a );
1081 aSign = extractFloatx80Sign( a );
1082 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
1083 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
1084 }
1085 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
1086 return packFloat128( aSign, aExp, zSig0, zSig1 );
1087
1088 }
1089
1090 #endif
1091
1092 /*----------------------------------------------------------------------------
1093 | Rounds the extended double-precision floating-point value `a' to an integer,
1094 | and returns the result as an extended quadruple-precision floating-point
1095 | value. The operation is performed according to the IEC/IEEE Standard for
1096 | Binary Floating-Point Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098
1099 floatx80 floatx80_round_to_int( floatx80 a )
1100 {
1101 flag aSign;
1102 int32 aExp;
1103 bits64 lastBitMask, roundBitsMask;
1104 int8 roundingMode;
1105 floatx80 z;
1106
1107 aExp = extractFloatx80Exp( a );
1108 if ( 0x403E <= aExp ) {
1109 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
1110 return propagateFloatx80NaN( a, a );
1111 }
1112 return a;
1113 }
1114 if ( aExp < 0x3FFF ) {
1115 if ( ( aExp == 0 )
1116 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
1117 return a;
1118 }
1119 float_exception_flags |= float_flag_inexact;
1120 aSign = extractFloatx80Sign( a );
1121 switch ( float_rounding_mode ) {
1122 case float_round_nearest_even:
1123 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
1124 ) {
1125 return
1126 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
1127 }
1128 break;
1129 case float_round_down:
1130 return
1131 aSign ?
1132 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
1133 : packFloatx80( 0, 0, 0 );
1134 case float_round_up:
1135 return
1136 aSign ? packFloatx80( 1, 0, 0 )
1137 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
1138 }
1139 return packFloatx80( aSign, 0, 0 );
1140 }
1141 lastBitMask = 1;
1142 lastBitMask <<= 0x403E - aExp;
1143 roundBitsMask = lastBitMask - 1;
1144 z = a;
1145 roundingMode = float_rounding_mode;
1146 if ( roundingMode == float_round_nearest_even ) {
1147 z.low += lastBitMask>>1;
1148 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1149 }
1150 else if ( roundingMode != float_round_to_zero ) {
1151 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1152 z.low += roundBitsMask;
1153 }
1154 }
1155 z.low &= ~ roundBitsMask;
1156 if ( z.low == 0 ) {
1157 ++z.high;
1158 z.low = LIT64( 0x8000000000000000 );
1159 }
1160 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
1161 return z;
1162
1163 }
1164
1165 /*----------------------------------------------------------------------------
1166 | Returns the result of adding the absolute values of the extended double-
1167 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
1168 | negated before being returned. `zSign' is ignored if the result is a NaN.
1169 | The addition is performed according to the IEC/IEEE Standard for Binary
1170 | Floating-Point Arithmetic.
1171 *----------------------------------------------------------------------------*/
1172
1173 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
1174 {
1175 int32 aExp, bExp, zExp;
1176 bits64 aSig, bSig, zSig0, zSig1;
1177 int32 expDiff;
1178
1179 aSig = extractFloatx80Frac( a );
1180 aExp = extractFloatx80Exp( a );
1181 bSig = extractFloatx80Frac( b );
1182 bExp = extractFloatx80Exp( b );
1183 expDiff = aExp - bExp;
1184 if ( 0 < expDiff ) {
1185 if ( aExp == 0x7FFF ) {
1186 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
1187 return a;
1188 }
1189 if ( bExp == 0 ) --expDiff;
1190 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
1191 zExp = aExp;
1192 }
1193 else if ( expDiff < 0 ) {
1194 if ( bExp == 0x7FFF ) {
1195 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1196 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1197 }
1198 if ( aExp == 0 ) ++expDiff;
1199 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
1200 zExp = bExp;
1201 }
1202 else {
1203 if ( aExp == 0x7FFF ) {
1204 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
1205 return propagateFloatx80NaN( a, b );
1206 }
1207 return a;
1208 }
1209 zSig1 = 0;
1210 zSig0 = aSig + bSig;
1211 if ( aExp == 0 ) {
1212 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
1213 goto roundAndPack;
1214 }
1215 zExp = aExp;
1216 goto shiftRight1;
1217 }
1218 zSig0 = aSig + bSig;
1219 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
1220 shiftRight1:
1221 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
1222 zSig0 |= LIT64( 0x8000000000000000 );
1223 ++zExp;
1224 roundAndPack:
1225 return
1226 roundAndPackFloatx80(
1227 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1228
1229 }
1230
1231 /*----------------------------------------------------------------------------
1232 | Returns the result of subtracting the absolute values of the extended
1233 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
1234 | difference is negated before being returned. `zSign' is ignored if the
1235 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1236 | Standard for Binary Floating-Point Arithmetic.
1237 *----------------------------------------------------------------------------*/
1238
1239 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
1240 {
1241 int32 aExp, bExp, zExp;
1242 bits64 aSig, bSig, zSig0, zSig1;
1243 int32 expDiff;
1244 floatx80 z;
1245
1246 aSig = extractFloatx80Frac( a );
1247 aExp = extractFloatx80Exp( a );
1248 bSig = extractFloatx80Frac( b );
1249 bExp = extractFloatx80Exp( b );
1250 expDiff = aExp - bExp;
1251 if ( 0 < expDiff ) goto aExpBigger;
1252 if ( expDiff < 0 ) goto bExpBigger;
1253 if ( aExp == 0x7FFF ) {
1254 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
1255 return propagateFloatx80NaN( a, b );
1256 }
1257 float_raise( float_flag_invalid );
1258 z.low = floatx80_default_nan_low;
1259 z.high = floatx80_default_nan_high;
1260 return z;
1261 }
1262 if ( aExp == 0 ) {
1263 aExp = 1;
1264 bExp = 1;
1265 }
1266 zSig1 = 0;
1267 if ( bSig < aSig ) goto aBigger;
1268 if ( aSig < bSig ) goto bBigger;
1269 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
1270 bExpBigger:
1271 if ( bExp == 0x7FFF ) {
1272 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1273 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
1274 }
1275 if ( aExp == 0 ) ++expDiff;
1276 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
1277 bBigger:
1278 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
1279 zExp = bExp;
1280 zSign ^= 1;
1281 goto normalizeRoundAndPack;
1282 aExpBigger:
1283 if ( aExp == 0x7FFF ) {
1284 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
1285 return a;
1286 }
1287 if ( bExp == 0 ) --expDiff;
1288 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
1289 aBigger:
1290 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
1291 zExp = aExp;
1292 normalizeRoundAndPack:
1293 return
1294 normalizeRoundAndPackFloatx80(
1295 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1296
1297 }
1298
1299 /*----------------------------------------------------------------------------
1300 | Returns the result of adding the extended double-precision floating-point
1301 | values `a' and `b'. The operation is performed according to the IEC/IEEE
1302 | Standard for Binary Floating-Point Arithmetic.
1303 *----------------------------------------------------------------------------*/
1304
1305 floatx80 floatx80_add( floatx80 a, floatx80 b )
1306 {
1307 flag aSign, bSign;
1308
1309 aSign = extractFloatx80Sign( a );
1310 bSign = extractFloatx80Sign( b );
1311 if ( aSign == bSign ) {
1312 return addFloatx80Sigs( a, b, aSign );
1313 }
1314 else {
1315 return subFloatx80Sigs( a, b, aSign );
1316 }
1317
1318 }
1319
1320 /*----------------------------------------------------------------------------
1321 | Returns the result of subtracting the extended double-precision floating-
1322 | point values `a' and `b'. The operation is performed according to the
1323 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1324 *----------------------------------------------------------------------------*/
1325
1326 floatx80 floatx80_sub( floatx80 a, floatx80 b )
1327 {
1328 flag aSign, bSign;
1329
1330 aSign = extractFloatx80Sign( a );
1331 bSign = extractFloatx80Sign( b );
1332 if ( aSign == bSign ) {
1333 return subFloatx80Sigs( a, b, aSign );
1334 }
1335 else {
1336 return addFloatx80Sigs( a, b, aSign );
1337 }
1338
1339 }
1340
1341 /*----------------------------------------------------------------------------
1342 | Returns the result of multiplying the extended double-precision floating-
1343 | point values `a' and `b'. The operation is performed according to the
1344 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1345 *----------------------------------------------------------------------------*/
1346
1347 floatx80 floatx80_mul( floatx80 a, floatx80 b )
1348 {
1349 flag aSign, bSign, zSign;
1350 int32 aExp, bExp, zExp;
1351 bits64 aSig, bSig, zSig0, zSig1;
1352 floatx80 z;
1353
1354 aSig = extractFloatx80Frac( a );
1355 aExp = extractFloatx80Exp( a );
1356 aSign = extractFloatx80Sign( a );
1357 bSig = extractFloatx80Frac( b );
1358 bExp = extractFloatx80Exp( b );
1359 bSign = extractFloatx80Sign( b );
1360 zSign = aSign ^ bSign;
1361 if ( aExp == 0x7FFF ) {
1362 if ( (bits64) ( aSig<<1 )
1363 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
1364 return propagateFloatx80NaN( a, b );
1365 }
1366 if ( ( bExp | bSig ) == 0 ) goto invalid;
1367 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1368 }
1369 if ( bExp == 0x7FFF ) {
1370 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1371 if ( ( aExp | aSig ) == 0 ) {
1372 invalid:
1373 float_raise( float_flag_invalid );
1374 z.low = floatx80_default_nan_low;
1375 z.high = floatx80_default_nan_high;
1376 return z;
1377 }
1378 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1379 }
1380 if ( aExp == 0 ) {
1381 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
1382 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
1383 }
1384 if ( bExp == 0 ) {
1385 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
1386 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
1387 }
1388 zExp = aExp + bExp - 0x3FFE;
1389 mul64To128( aSig, bSig, &zSig0, &zSig1 );
1390 if ( 0 < (sbits64) zSig0 ) {
1391 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
1392 --zExp;
1393 }
1394 return
1395 roundAndPackFloatx80(
1396 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1397
1398 }
1399
1400 /*----------------------------------------------------------------------------
1401 | Returns the result of dividing the extended double-precision floating-point
1402 | value `a' by the corresponding value `b'. The operation is performed
1403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1404 *----------------------------------------------------------------------------*/
1405
1406 floatx80 floatx80_div( floatx80 a, floatx80 b )
1407 {
1408 flag aSign, bSign, zSign;
1409 int32 aExp, bExp, zExp;
1410 bits64 aSig, bSig, zSig0, zSig1;
1411 bits64 rem0, rem1, rem2, term0, term1, term2;
1412 floatx80 z;
1413
1414 aSig = extractFloatx80Frac( a );
1415 aExp = extractFloatx80Exp( a );
1416 aSign = extractFloatx80Sign( a );
1417 bSig = extractFloatx80Frac( b );
1418 bExp = extractFloatx80Exp( b );
1419 bSign = extractFloatx80Sign( b );
1420 zSign = aSign ^ bSign;
1421 if ( aExp == 0x7FFF ) {
1422 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
1423 if ( bExp == 0x7FFF ) {
1424 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1425 goto invalid;
1426 }
1427 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1428 }
1429 if ( bExp == 0x7FFF ) {
1430 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1431 return packFloatx80( zSign, 0, 0 );
1432 }
1433 if ( bExp == 0 ) {
1434 if ( bSig == 0 ) {
1435 if ( ( aExp | aSig ) == 0 ) {
1436 invalid:
1437 float_raise( float_flag_invalid );
1438 z.low = floatx80_default_nan_low;
1439 z.high = floatx80_default_nan_high;
1440 return z;
1441 }
1442 float_raise( float_flag_divbyzero );
1443 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1444 }
1445 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
1446 }
1447 if ( aExp == 0 ) {
1448 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
1449 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
1450 }
1451 zExp = aExp - bExp + 0x3FFE;
1452 rem1 = 0;
1453 if ( bSig <= aSig ) {
1454 shift128Right( aSig, 0, 1, &aSig, &rem1 );
1455 ++zExp;
1456 }
1457 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
1458 mul64To128( bSig, zSig0, &term0, &term1 );
1459 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
1460 while ( (sbits64) rem0 < 0 ) {
1461 --zSig0;
1462 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
1463 }
1464 zSig1 = estimateDiv128To64( rem1, 0, bSig );
1465 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
1466 mul64To128( bSig, zSig1, &term1, &term2 );
1467 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
1468 while ( (sbits64) rem1 < 0 ) {
1469 --zSig1;
1470 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
1471 }
1472 zSig1 |= ( ( rem1 | rem2 ) != 0 );
1473 }
1474 return
1475 roundAndPackFloatx80(
1476 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1477
1478 }
1479
1480 /*----------------------------------------------------------------------------
1481 | Returns the remainder of the extended double-precision floating-point value
1482 | `a' with respect to the corresponding value `b'. The operation is performed
1483 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1484 *----------------------------------------------------------------------------*/
1485
1486 floatx80 floatx80_rem( floatx80 a, floatx80 b )
1487 {
1488 flag aSign, bSign, zSign;
1489 int32 aExp, bExp, expDiff;
1490 bits64 aSig0, aSig1, bSig;
1491 bits64 q, term0, term1, alternateASig0, alternateASig1;
1492 floatx80 z;
1493
1494 aSig0 = extractFloatx80Frac( a );
1495 aExp = extractFloatx80Exp( a );
1496 aSign = extractFloatx80Sign( a );
1497 bSig = extractFloatx80Frac( b );
1498 bExp = extractFloatx80Exp( b );
1499 bSign = extractFloatx80Sign( b );
1500 if ( aExp == 0x7FFF ) {
1501 if ( (bits64) ( aSig0<<1 )
1502 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
1503 return propagateFloatx80NaN( a, b );
1504 }
1505 goto invalid;
1506 }
1507 if ( bExp == 0x7FFF ) {
1508 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1509 return a;
1510 }
1511 if ( bExp == 0 ) {
1512 if ( bSig == 0 ) {
1513 invalid:
1514 float_raise( float_flag_invalid );
1515 z.low = floatx80_default_nan_low;
1516 z.high = floatx80_default_nan_high;
1517 return z;
1518 }
1519 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
1520 }
1521 if ( aExp == 0 ) {
1522 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
1523 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
1524 }
1525 bSig |= LIT64( 0x8000000000000000 );
1526 zSign = aSign;
1527 expDiff = aExp - bExp;
1528 aSig1 = 0;
1529 if ( expDiff < 0 ) {
1530 if ( expDiff < -1 ) return a;
1531 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
1532 expDiff = 0;
1533 }
1534 q = ( bSig <= aSig0 );
1535 if ( q ) aSig0 -= bSig;
1536 expDiff -= 64;
1537 while ( 0 < expDiff ) {
1538 q = estimateDiv128To64( aSig0, aSig1, bSig );
1539 q = ( 2 < q ) ? q - 2 : 0;
1540 mul64To128( bSig, q, &term0, &term1 );
1541 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
1542 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
1543 expDiff -= 62;
1544 }
1545 expDiff += 64;
1546 if ( 0 < expDiff ) {
1547 q = estimateDiv128To64( aSig0, aSig1, bSig );
1548 q = ( 2 < q ) ? q - 2 : 0;
1549 q >>= 64 - expDiff;
1550 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
1551 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
1552 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
1553 while ( le128( term0, term1, aSig0, aSig1 ) ) {
1554 ++q;
1555 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
1556 }
1557 }
1558 else {
1559 term1 = 0;
1560 term0 = bSig;
1561 }
1562 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
1563 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
1564 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
1565 && ( q & 1 ) )
1566 ) {
1567 aSig0 = alternateASig0;
1568 aSig1 = alternateASig1;
1569 zSign = ! zSign;
1570 }
1571 return
1572 normalizeRoundAndPackFloatx80(
1573 80, zSign, bExp + expDiff, aSig0, aSig1 );
1574
1575 }
1576
1577 /*----------------------------------------------------------------------------
1578 | Returns the square root of the extended double-precision floating-point
1579 | value `a'. The operation is performed according to the IEC/IEEE Standard
1580 | for Binary Floating-Point Arithmetic.
1581 *----------------------------------------------------------------------------*/
1582
1583 floatx80 floatx80_sqrt( floatx80 a )
1584 {
1585 flag aSign;
1586 int32 aExp, zExp;
1587 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
1588 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1589 floatx80 z;
1590
1591 aSig0 = extractFloatx80Frac( a );
1592 aExp = extractFloatx80Exp( a );
1593 aSign = extractFloatx80Sign( a );
1594 if ( aExp == 0x7FFF ) {
1595 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
1596 if ( ! aSign ) return a;
1597 goto invalid;
1598 }
1599 if ( aSign ) {
1600 if ( ( aExp | aSig0 ) == 0 ) return a;
1601 invalid:
1602 float_raise( float_flag_invalid );
1603 z.low = floatx80_default_nan_low;
1604 z.high = floatx80_default_nan_high;
1605 return z;
1606 }
1607 if ( aExp == 0 ) {
1608 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
1609 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
1610 }
1611 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
1612 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
1613 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
1614 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
1615 doubleZSig0 = zSig0<<1;
1616 mul64To128( zSig0, zSig0, &term0, &term1 );
1617 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
1618 while ( (sbits64) rem0 < 0 ) {
1619 --zSig0;
1620 doubleZSig0 -= 2;
1621 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
1622 }
1623 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
1624 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
1625 if ( zSig1 == 0 ) zSig1 = 1;
1626 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
1627 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
1628 mul64To128( zSig1, zSig1, &term2, &term3 );
1629 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
1630 while ( (sbits64) rem1 < 0 ) {
1631 --zSig1;
1632 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
1633 term3 |= 1;
1634 term2 |= doubleZSig0;
1635 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
1636 }
1637 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
1638 }
1639 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
1640 zSig0 |= doubleZSig0;
1641 return
1642 roundAndPackFloatx80(
1643 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
1644
1645 }
1646
1647 /*----------------------------------------------------------------------------
1648 | Returns 1 if the extended double-precision floating-point value `a' is
1649 | equal to the corresponding value `b', and 0 otherwise. The comparison is
1650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1651 | Arithmetic.
1652 *----------------------------------------------------------------------------*/
1653
1654 flag floatx80_eq( floatx80 a, floatx80 b )
1655 {
1656
1657 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
1658 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1659 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
1660 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1661 ) {
1662 if ( floatx80_is_signaling_nan( a )
1663 || floatx80_is_signaling_nan( b ) ) {
1664 float_raise( float_flag_invalid );
1665 }
1666 return 0;
1667 }
1668 return
1669 ( a.low == b.low )
1670 && ( ( a.high == b.high )
1671 || ( ( a.low == 0 )
1672 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
1673 );
1674
1675 }
1676
1677 /*----------------------------------------------------------------------------
1678 | Returns 1 if the extended double-precision floating-point value `a' is
1679 | less than or equal to the corresponding value `b', and 0 otherwise. The
1680 | comparison is performed according to the IEC/IEEE Standard for Binary
1681 | Floating-Point Arithmetic.
1682 *----------------------------------------------------------------------------*/
1683
1684 flag floatx80_le( floatx80 a, floatx80 b )
1685 {
1686 flag aSign, bSign;
1687
1688 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
1689 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1690 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
1691 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1692 ) {
1693 float_raise( float_flag_invalid );
1694 return 0;
1695 }
1696 aSign = extractFloatx80Sign( a );
1697 bSign = extractFloatx80Sign( b );
1698 if ( aSign != bSign ) {
1699 return
1700 aSign
1701 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1702 == 0 );
1703 }
1704 return
1705 aSign ? le128( b.high, b.low, a.high, a.low )
1706 : le128( a.high, a.low, b.high, b.low );
1707
1708 }
1709
1710 /*----------------------------------------------------------------------------
1711 | Returns 1 if the extended double-precision floating-point value `a' is
1712 | less than the corresponding value `b', and 0 otherwise. The comparison
1713 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1714 | Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716
1717 flag floatx80_lt( floatx80 a, floatx80 b )
1718 {
1719 flag aSign, bSign;
1720
1721 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
1722 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1723 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
1724 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1725 ) {
1726 float_raise( float_flag_invalid );
1727 return 0;
1728 }
1729 aSign = extractFloatx80Sign( a );
1730 bSign = extractFloatx80Sign( b );
1731 if ( aSign != bSign ) {
1732 return
1733 aSign
1734 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1735 != 0 );
1736 }
1737 return
1738 aSign ? lt128( b.high, b.low, a.high, a.low )
1739 : lt128( a.high, a.low, b.high, b.low );
1740
1741 }
1742
1743 /*----------------------------------------------------------------------------
1744 | Returns 1 if the extended double-precision floating-point value `a' is equal
1745 | to the corresponding value `b', and 0 otherwise. The invalid exception is
1746 | raised if either operand is a NaN. Otherwise, the comparison is performed
1747 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1748 *----------------------------------------------------------------------------*/
1749
1750 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
1751 {
1752
1753 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
1754 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1755 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
1756 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1757 ) {
1758 float_raise( float_flag_invalid );
1759 return 0;
1760 }
1761 return
1762 ( a.low == b.low )
1763 && ( ( a.high == b.high )
1764 || ( ( a.low == 0 )
1765 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
1766 );
1767
1768 }
1769
1770 /*----------------------------------------------------------------------------
1771 | Returns 1 if the extended double-precision floating-point value `a' is less
1772 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
1773 | do not cause an exception. Otherwise, the comparison is performed according
1774 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1775 *----------------------------------------------------------------------------*/
1776
1777 flag floatx80_le_quiet( floatx80 a, floatx80 b )
1778 {
1779 flag aSign, bSign;
1780
1781 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
1782 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1783 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
1784 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1785 ) {
1786 if ( floatx80_is_signaling_nan( a )
1787 || floatx80_is_signaling_nan( b ) ) {
1788 float_raise( float_flag_invalid );
1789 }
1790 return 0;
1791 }
1792 aSign = extractFloatx80Sign( a );
1793 bSign = extractFloatx80Sign( b );
1794 if ( aSign != bSign ) {
1795 return
1796 aSign
1797 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1798 == 0 );
1799 }
1800 return
1801 aSign ? le128( b.high, b.low, a.high, a.low )
1802 : le128( a.high, a.low, b.high, b.low );
1803
1804 }
1805
1806 /*----------------------------------------------------------------------------
1807 | Returns 1 if the extended double-precision floating-point value `a' is less
1808 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
1809 | an exception. Otherwise, the comparison is performed according to the
1810 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1811 *----------------------------------------------------------------------------*/
1812
1813 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
1814 {
1815 flag aSign, bSign;
1816
1817 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
1818 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1819 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
1820 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1821 ) {
1822 if ( floatx80_is_signaling_nan( a )
1823 || floatx80_is_signaling_nan( b ) ) {
1824 float_raise( float_flag_invalid );
1825 }
1826 return 0;
1827 }
1828 aSign = extractFloatx80Sign( a );
1829 bSign = extractFloatx80Sign( b );
1830 if ( aSign != bSign ) {
1831 return
1832 aSign
1833 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1834 != 0 );
1835 }
1836 return
1837 aSign ? lt128( b.high, b.low, a.high, a.low )
1838 : lt128( a.high, a.low, b.high, b.low );
1839
1840 }
1841
1842 #endif
1843
1844 #ifdef FLOAT128
1845
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of converting the quadruple-precision floating-point
1848 | value `a' to the 32-bit two's complement integer format. The conversion
1849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1850 | Arithmetic---which means in particular that the conversion is rounded
1851 | according to the current rounding mode. If `a' is a NaN, the largest
1852 | positive integer is returned. Otherwise, if the conversion overflows, the
1853 | largest integer with the same sign as `a' is returned.
1854 *----------------------------------------------------------------------------*/
1855
1856 int32 float128_to_int32( float128 a )
1857 {
1858 flag aSign;
1859 int32 aExp, shiftCount;
1860 bits64 aSig0, aSig1;
1861
1862 aSig1 = extractFloat128Frac1( a );
1863 aSig0 = extractFloat128Frac0( a );
1864 aExp = extractFloat128Exp( a );
1865 aSign = extractFloat128Sign( a );
1866 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1867 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
1868 aSig0 |= ( aSig1 != 0 );
1869 shiftCount = 0x4028 - aExp;
1870 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
1871 return roundAndPackInt32( aSign, aSig0 );
1872
1873 }
1874
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of converting the quadruple-precision floating-point
1877 | value `a' to the 32-bit two's complement integer format. The conversion
1878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1879 | Arithmetic, except that the conversion is always rounded toward zero. If
1880 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1881 | conversion overflows, the largest integer with the same sign as `a' is
1882 | returned.
1883 *----------------------------------------------------------------------------*/
1884
1885 int32 float128_to_int32_round_to_zero( float128 a )
1886 {
1887 flag aSign;
1888 int32 aExp, shiftCount;
1889 bits64 aSig0, aSig1, savedASig;
1890 int32 z;
1891
1892 aSig1 = extractFloat128Frac1( a );
1893 aSig0 = extractFloat128Frac0( a );
1894 aExp = extractFloat128Exp( a );
1895 aSign = extractFloat128Sign( a );
1896 aSig0 |= ( aSig1 != 0 );
1897 if ( 0x401E < aExp ) {
1898 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
1899 goto invalid;
1900 }
1901 else if ( aExp < 0x3FFF ) {
1902 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
1903 return 0;
1904 }
1905 aSig0 |= LIT64( 0x0001000000000000 );
1906 shiftCount = 0x402F - aExp;
1907 savedASig = aSig0;
1908 aSig0 >>= shiftCount;
1909 z = aSig0;
1910 if ( aSign ) z = - z;
1911 if ( ( z < 0 ) ^ aSign ) {
1912 invalid:
1913 float_raise( float_flag_invalid );
1914 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1915 }
1916 if ( ( aSig0<<shiftCount ) != savedASig ) {
1917 float_exception_flags |= float_flag_inexact;
1918 }
1919 return z;
1920
1921 }
1922
1923 /*----------------------------------------------------------------------------
1924 | Returns the result of converting the quadruple-precision floating-point
1925 | value `a' to the 64-bit two's complement integer format. The conversion
1926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1927 | Arithmetic---which means in particular that the conversion is rounded
1928 | according to the current rounding mode. If `a' is a NaN, the largest
1929 | positive integer is returned. Otherwise, if the conversion overflows, the
1930 | largest integer with the same sign as `a' is returned.
1931 *----------------------------------------------------------------------------*/
1932
1933 int64 float128_to_int64( float128 a )
1934 {
1935 flag aSign;
1936 int32 aExp, shiftCount;
1937 bits64 aSig0, aSig1;
1938
1939 aSig1 = extractFloat128Frac1( a );
1940 aSig0 = extractFloat128Frac0( a );
1941 aExp = extractFloat128Exp( a );
1942 aSign = extractFloat128Sign( a );
1943 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
1944 shiftCount = 0x402F - aExp;
1945 if ( shiftCount <= 0 ) {
1946 if ( 0x403E < aExp ) {
1947 float_raise( float_flag_invalid );
1948 if ( ! aSign
1949 || ( ( aExp == 0x7FFF )
1950 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
1951 )
1952 ) {
1953 return LIT64( 0x7FFFFFFFFFFFFFFF );
1954 }
1955 return (sbits64) LIT64( 0x8000000000000000 );
1956 }
1957 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
1958 }
1959 else {
1960 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
1961 }
1962 return roundAndPackInt64( aSign, aSig0, aSig1 );
1963
1964 }
1965
1966 /*----------------------------------------------------------------------------
1967 | Returns the result of converting the quadruple-precision floating-point
1968 | value `a' to the 64-bit two's complement integer format. The conversion
1969 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1970 | Arithmetic, except that the conversion is always rounded toward zero.
1971 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1972 | the conversion overflows, the largest integer with the same sign as `a' is
1973 | returned.
1974 *----------------------------------------------------------------------------*/
1975
1976 int64 float128_to_int64_round_to_zero( float128 a )
1977 {
1978 flag aSign;
1979 int32 aExp, shiftCount;
1980 bits64 aSig0, aSig1;
1981 int64 z;
1982
1983 aSig1 = extractFloat128Frac1( a );
1984 aSig0 = extractFloat128Frac0( a );
1985 aExp = extractFloat128Exp( a );
1986 aSign = extractFloat128Sign( a );
1987 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
1988 shiftCount = aExp - 0x402F;
1989 if ( 0 < shiftCount ) {
1990 if ( 0x403E <= aExp ) {
1991 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
1992 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
1993 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
1994 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
1995 }
1996 else {
1997 float_raise( float_flag_invalid );
1998 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
1999 return LIT64( 0x7FFFFFFFFFFFFFFF );
2000 }
2001 }
2002 return (sbits64) LIT64( 0x8000000000000000 );
2003 }
2004 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
2005 if ( (bits64) ( aSig1<<shiftCount ) ) {
2006 float_exception_flags |= float_flag_inexact;
2007 }
2008 }
2009 else {
2010 if ( aExp < 0x3FFF ) {
2011 if ( aExp | aSig0 | aSig1 ) {
2012 float_exception_flags |= float_flag_inexact;
2013 }
2014 return 0;
2015 }
2016 z = aSig0>>( - shiftCount );
2017 if ( aSig1
2018 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
2019 float_exception_flags |= float_flag_inexact;
2020 }
2021 }
2022 if ( aSign ) z = - z;
2023 return z;
2024
2025 }
2026
2027 /*----------------------------------------------------------------------------
2028 | Returns the result of converting the quadruple-precision floating-point
2029 | value `a' to the single-precision floating-point format. The conversion
2030 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2031 | Arithmetic.
2032 *----------------------------------------------------------------------------*/
2033
2034 float32 float128_to_float32( float128 a )
2035 {
2036 flag aSign;
2037 int32 aExp;
2038 bits64 aSig0, aSig1;
2039 bits32 zSig;
2040
2041 aSig1 = extractFloat128Frac1( a );
2042 aSig0 = extractFloat128Frac0( a );
2043 aExp = extractFloat128Exp( a );
2044 aSign = extractFloat128Sign( a );
2045 if ( aExp == 0x7FFF ) {
2046 if ( aSig0 | aSig1 ) {
2047 return commonNaNToFloat32( float128ToCommonNaN( a ) );
2048 }
2049 return packFloat32( aSign, 0xFF, 0 );
2050 }
2051 aSig0 |= ( aSig1 != 0 );
2052 shift64RightJamming( aSig0, 18, &aSig0 );
2053 zSig = aSig0;
2054 if ( aExp || zSig ) {
2055 zSig |= 0x40000000;
2056 aExp -= 0x3F81;
2057 }
2058 return roundAndPackFloat32( aSign, aExp, zSig );
2059
2060 }
2061
2062 /*----------------------------------------------------------------------------
2063 | Returns the result of converting the quadruple-precision floating-point
2064 | value `a' to the double-precision floating-point format. The conversion
2065 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2066 | Arithmetic.
2067 *----------------------------------------------------------------------------*/
2068
2069 float64 float128_to_float64( float128 a )
2070 {
2071 flag aSign;
2072 int32 aExp;
2073 bits64 aSig0, aSig1;
2074
2075 aSig1 = extractFloat128Frac1( a );
2076 aSig0 = extractFloat128Frac0( a );
2077 aExp = extractFloat128Exp( a );
2078 aSign = extractFloat128Sign( a );
2079 if ( aExp == 0x7FFF ) {
2080 if ( aSig0 | aSig1 ) {
2081 return commonNaNToFloat64( float128ToCommonNaN( a ) );
2082 }
2083 return packFloat64( aSign, 0x7FF, 0 );
2084 }
2085 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
2086 aSig0 |= ( aSig1 != 0 );
2087 if ( aExp || aSig0 ) {
2088 aSig0 |= LIT64( 0x4000000000000000 );
2089 aExp -= 0x3C01;
2090 }
2091 return roundAndPackFloat64( aSign, aExp, aSig0 );
2092
2093 }
2094
2095 #ifdef FLOATX80
2096
2097 /*----------------------------------------------------------------------------
2098 | Returns the result of converting the quadruple-precision floating-point
2099 | value `a' to the extended double-precision floating-point format. The
2100 | conversion is performed according to the IEC/IEEE Standard for Binary
2101 | Floating-Point Arithmetic.
2102 *----------------------------------------------------------------------------*/
2103
2104 floatx80 float128_to_floatx80( float128 a )
2105 {
2106 flag aSign;
2107 int32 aExp;
2108 bits64 aSig0, aSig1;
2109
2110 aSig1 = extractFloat128Frac1( a );
2111 aSig0 = extractFloat128Frac0( a );
2112 aExp = extractFloat128Exp( a );
2113 aSign = extractFloat128Sign( a );
2114 if ( aExp == 0x7FFF ) {
2115 if ( aSig0 | aSig1 ) {
2116 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
2117 }
2118 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2119 }
2120 if ( aExp == 0 ) {
2121 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
2122 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2123 }
2124 else {
2125 aSig0 |= LIT64( 0x0001000000000000 );
2126 }
2127 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
2128 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
2129
2130 }
2131
2132 #endif
2133
2134 /*----------------------------------------------------------------------------
2135 | Rounds the quadruple-precision floating-point value `a' to an integer, and
2136 | returns the result as a quadruple-precision floating-point value. The
2137 | operation is performed according to the IEC/IEEE Standard for Binary
2138 | Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2140
2141 float128 float128_round_to_int( float128 a )
2142 {
2143 flag aSign;
2144 int32 aExp;
2145 bits64 lastBitMask, roundBitsMask;
2146 int8 roundingMode;
2147 float128 z;
2148
2149 aExp = extractFloat128Exp( a );
2150 if ( 0x402F <= aExp ) {
2151 if ( 0x406F <= aExp ) {
2152 if ( ( aExp == 0x7FFF )
2153 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
2154 ) {
2155 return propagateFloat128NaN( a, a );
2156 }
2157 return a;
2158 }
2159 lastBitMask = 1;
2160 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
2161 roundBitsMask = lastBitMask - 1;
2162 z = a;
2163 roundingMode = float_rounding_mode;
2164 if ( roundingMode == float_round_nearest_even ) {
2165 if ( lastBitMask ) {
2166 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
2167 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2168 }
2169 else {
2170 if ( (sbits64) z.low < 0 ) {
2171 ++z.high;
2172 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
2173 }
2174 }
2175 }
2176 else if ( roundingMode != float_round_to_zero ) {
2177 if ( extractFloat128Sign( z )
2178 ^ ( roundingMode == float_round_up ) ) {
2179 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
2180 }
2181 }
2182 z.low &= ~ roundBitsMask;
2183 }
2184 else {
2185 if ( aExp < 0x3FFF ) {
2186 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
2187 float_exception_flags |= float_flag_inexact;
2188 aSign = extractFloat128Sign( a );
2189 switch ( float_rounding_mode ) {
2190 case float_round_nearest_even:
2191 if ( ( aExp == 0x3FFE )
2192 && ( extractFloat128Frac0( a )
2193 | extractFloat128Frac1( a ) )
2194 ) {
2195 return packFloat128( aSign, 0x3FFF, 0, 0 );
2196 }
2197 break;
2198 case float_round_down:
2199 return
2200 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
2201 : packFloat128( 0, 0, 0, 0 );
2202 case float_round_up:
2203 return
2204 aSign ? packFloat128( 1, 0, 0, 0 )
2205 : packFloat128( 0, 0x3FFF, 0, 0 );
2206 }
2207 return packFloat128( aSign, 0, 0, 0 );
2208 }
2209 lastBitMask = 1;
2210 lastBitMask <<= 0x402F - aExp;
2211 roundBitsMask = lastBitMask - 1;
2212 z.low = 0;
2213 z.high = a.high;
2214 roundingMode = float_rounding_mode;
2215 if ( roundingMode == float_round_nearest_even ) {
2216 z.high += lastBitMask>>1;
2217 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
2218 z.high &= ~ lastBitMask;
2219 }
2220 }
2221 else if ( roundingMode != float_round_to_zero ) {
2222 if ( extractFloat128Sign( z )
2223 ^ ( roundingMode == float_round_up ) ) {
2224 z.high |= ( a.low != 0 );
2225 z.high += roundBitsMask;
2226 }
2227 }
2228 z.high &= ~ roundBitsMask;
2229 }
2230 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
2231 float_exception_flags |= float_flag_inexact;
2232 }
2233 return z;
2234
2235 }
2236
2237 /*----------------------------------------------------------------------------
2238 | Returns the result of adding the absolute values of the quadruple-precision
2239 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2240 | before being returned. `zSign' is ignored if the result is a NaN.
2241 | The addition is performed according to the IEC/IEEE Standard for Binary
2242 | Floating-Point Arithmetic.
2243 *----------------------------------------------------------------------------*/
2244
2245 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
2246 {
2247 int32 aExp, bExp, zExp;
2248 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
2249 int32 expDiff;
2250
2251 aSig1 = extractFloat128Frac1( a );
2252 aSig0 = extractFloat128Frac0( a );
2253 aExp = extractFloat128Exp( a );
2254 bSig1 = extractFloat128Frac1( b );
2255 bSig0 = extractFloat128Frac0( b );
2256 bExp = extractFloat128Exp( b );
2257 expDiff = aExp - bExp;
2258 if ( 0 < expDiff ) {
2259 if ( aExp == 0x7FFF ) {
2260 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
2261 return a;
2262 }
2263 if ( bExp == 0 ) {
2264 --expDiff;
2265 }
2266 else {
2267 bSig0 |= LIT64( 0x0001000000000000 );
2268 }
2269 shift128ExtraRightJamming(
2270 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
2271 zExp = aExp;
2272 }
2273 else if ( expDiff < 0 ) {
2274 if ( bExp == 0x7FFF ) {
2275 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2276 return packFloat128( zSign, 0x7FFF, 0, 0 );
2277 }
2278 if ( aExp == 0 ) {
2279 ++expDiff;
2280 }
2281 else {
2282 aSig0 |= LIT64( 0x0001000000000000 );
2283 }
2284 shift128ExtraRightJamming(
2285 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
2286 zExp = bExp;
2287 }
2288 else {
2289 if ( aExp == 0x7FFF ) {
2290 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
2291 return propagateFloat128NaN( a, b );
2292 }
2293 return a;
2294 }
2295 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
2296 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
2297 zSig2 = 0;
2298 zSig0 |= LIT64( 0x0002000000000000 );
2299 zExp = aExp;
2300 goto shiftRight1;
2301 }
2302 aSig0 |= LIT64( 0x0001000000000000 );
2303 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
2304 --zExp;
2305 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
2306 ++zExp;
2307 shiftRight1:
2308 shift128ExtraRightJamming(
2309 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
2310 roundAndPack:
2311 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2312
2313 }
2314
2315 /*----------------------------------------------------------------------------
2316 | Returns the result of subtracting the absolute values of the quadruple-
2317 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2318 | difference is negated before being returned. `zSign' is ignored if the
2319 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2320 | Standard for Binary Floating-Point Arithmetic.
2321 *----------------------------------------------------------------------------*/
2322
2323 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
2324 {
2325 int32 aExp, bExp, zExp;
2326 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
2327 int32 expDiff;
2328 float128 z;
2329
2330 aSig1 = extractFloat128Frac1( a );
2331 aSig0 = extractFloat128Frac0( a );
2332 aExp = extractFloat128Exp( a );
2333 bSig1 = extractFloat128Frac1( b );
2334 bSig0 = extractFloat128Frac0( b );
2335 bExp = extractFloat128Exp( b );
2336 expDiff = aExp - bExp;
2337 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
2338 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
2339 if ( 0 < expDiff ) goto aExpBigger;
2340 if ( expDiff < 0 ) goto bExpBigger;
2341 if ( aExp == 0x7FFF ) {
2342 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
2343 return propagateFloat128NaN( a, b );
2344 }
2345 float_raise( float_flag_invalid );
2346 z.low = float128_default_nan_low;
2347 z.high = float128_default_nan_high;
2348 return z;
2349 }
2350 if ( aExp == 0 ) {
2351 aExp = 1;
2352 bExp = 1;
2353 }
2354 if ( bSig0 < aSig0 ) goto aBigger;
2355 if ( aSig0 < bSig0 ) goto bBigger;
2356 if ( bSig1 < aSig1 ) goto aBigger;
2357 if ( aSig1 < bSig1 ) goto bBigger;
2358 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
2359 bExpBigger:
2360 if ( bExp == 0x7FFF ) {
2361 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2362 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
2363 }
2364 if ( aExp == 0 ) {
2365 ++expDiff;
2366 }
2367 else {
2368 aSig0 |= LIT64( 0x4000000000000000 );
2369 }
2370 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2371 bSig0 |= LIT64( 0x4000000000000000 );
2372 bBigger:
2373 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
2374 zExp = bExp;
2375 zSign ^= 1;
2376 goto normalizeRoundAndPack;
2377 aExpBigger:
2378 if ( aExp == 0x7FFF ) {
2379 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
2380 return a;
2381 }
2382 if ( bExp == 0 ) {
2383 --expDiff;
2384 }
2385 else {
2386 bSig0 |= LIT64( 0x4000000000000000 );
2387 }
2388 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
2389 aSig0 |= LIT64( 0x4000000000000000 );
2390 aBigger:
2391 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
2392 zExp = aExp;
2393 normalizeRoundAndPack:
2394 --zExp;
2395 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
2396
2397 }
2398
2399 /*----------------------------------------------------------------------------
2400 | Returns the result of adding the quadruple-precision floating-point values
2401 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2402 | for Binary Floating-Point Arithmetic.
2403 *----------------------------------------------------------------------------*/
2404
2405 float128 float128_add( float128 a, float128 b )
2406 {
2407 flag aSign, bSign;
2408
2409 aSign = extractFloat128Sign( a );
2410 bSign = extractFloat128Sign( b );
2411 if ( aSign == bSign ) {
2412 return addFloat128Sigs( a, b, aSign );
2413 }
2414 else {
2415 return subFloat128Sigs( a, b, aSign );
2416 }
2417
2418 }
2419
2420 /*----------------------------------------------------------------------------
2421 | Returns the result of subtracting the quadruple-precision floating-point
2422 | values `a' and `b'. The operation is performed according to the IEC/IEEE
2423 | Standard for Binary Floating-Point Arithmetic.
2424 *----------------------------------------------------------------------------*/
2425
2426 float128 float128_sub( float128 a, float128 b )
2427 {
2428 flag aSign, bSign;
2429
2430 aSign = extractFloat128Sign( a );
2431 bSign = extractFloat128Sign( b );
2432 if ( aSign == bSign ) {
2433 return subFloat128Sigs( a, b, aSign );
2434 }
2435 else {
2436 return addFloat128Sigs( a, b, aSign );
2437 }
2438
2439 }
2440
2441 /*----------------------------------------------------------------------------
2442 | Returns the result of multiplying the quadruple-precision floating-point
2443 | values `a' and `b'. The operation is performed according to the IEC/IEEE
2444 | Standard for Binary Floating-Point Arithmetic.
2445 *----------------------------------------------------------------------------*/
2446
2447 float128 float128_mul( float128 a, float128 b )
2448 {
2449 flag aSign, bSign, zSign;
2450 int32 aExp, bExp, zExp;
2451 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
2452 float128 z;
2453
2454 aSig1 = extractFloat128Frac1( a );
2455 aSig0 = extractFloat128Frac0( a );
2456 aExp = extractFloat128Exp( a );
2457 aSign = extractFloat128Sign( a );
2458 bSig1 = extractFloat128Frac1( b );
2459 bSig0 = extractFloat128Frac0( b );
2460 bExp = extractFloat128Exp( b );
2461 bSign = extractFloat128Sign( b );
2462 zSign = aSign ^ bSign;
2463 if ( aExp == 0x7FFF ) {
2464 if ( ( aSig0 | aSig1 )
2465 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
2466 return propagateFloat128NaN( a, b );
2467 }
2468 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
2469 return packFloat128( zSign, 0x7FFF, 0, 0 );
2470 }
2471 if ( bExp == 0x7FFF ) {
2472 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2473 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
2474 invalid:
2475 float_raise( float_flag_invalid );
2476 z.low = float128_default_nan_low;
2477 z.high = float128_default_nan_high;
2478 return z;
2479 }
2480 return packFloat128( zSign, 0x7FFF, 0, 0 );
2481 }
2482 if ( aExp == 0 ) {
2483 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
2484 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2485 }
2486 if ( bExp == 0 ) {
2487 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
2488 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2489 }
2490 zExp = aExp + bExp - 0x4000;
2491 aSig0 |= LIT64( 0x0001000000000000 );
2492 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
2493 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
2494 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
2495 zSig2 |= ( zSig3 != 0 );
2496 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
2497 shift128ExtraRightJamming(
2498 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
2499 ++zExp;
2500 }
2501 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2502
2503 }
2504
2505 /*----------------------------------------------------------------------------
2506 | Returns the result of dividing the quadruple-precision floating-point value
2507 | `a' by the corresponding value `b'. The operation is performed according to
2508 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2509 *----------------------------------------------------------------------------*/
2510
2511 float128 float128_div( float128 a, float128 b )
2512 {
2513 flag aSign, bSign, zSign;
2514 int32 aExp, bExp, zExp;
2515 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
2516 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2517 float128 z;
2518
2519 aSig1 = extractFloat128Frac1( a );
2520 aSig0 = extractFloat128Frac0( a );
2521 aExp = extractFloat128Exp( a );
2522 aSign = extractFloat128Sign( a );
2523 bSig1 = extractFloat128Frac1( b );
2524 bSig0 = extractFloat128Frac0( b );
2525 bExp = extractFloat128Exp( b );
2526 bSign = extractFloat128Sign( b );
2527 zSign = aSign ^ bSign;
2528 if ( aExp == 0x7FFF ) {
2529 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
2530 if ( bExp == 0x7FFF ) {
2531 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2532 goto invalid;
2533 }
2534 return packFloat128( zSign, 0x7FFF, 0, 0 );
2535 }
2536 if ( bExp == 0x7FFF ) {
2537 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2538 return packFloat128( zSign, 0, 0, 0 );
2539 }
2540 if ( bExp == 0 ) {
2541 if ( ( bSig0 | bSig1 ) == 0 ) {
2542 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
2543 invalid:
2544 float_raise( float_flag_invalid );
2545 z.low = float128_default_nan_low;
2546 z.high = float128_default_nan_high;
2547 return z;
2548 }
2549 float_raise( float_flag_divbyzero );
2550 return packFloat128( zSign, 0x7FFF, 0, 0 );
2551 }
2552 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2553 }
2554 if ( aExp == 0 ) {
2555 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
2556 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2557 }
2558 zExp = aExp - bExp + 0x3FFD;
2559 shortShift128Left(
2560 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
2561 shortShift128Left(
2562 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
2563 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
2564 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
2565 ++zExp;
2566 }
2567 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
2568 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
2569 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
2570 while ( (sbits64) rem0 < 0 ) {
2571 --zSig0;
2572 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
2573 }
2574 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
2575 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
2576 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
2577 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
2578 while ( (sbits64) rem1 < 0 ) {
2579 --zSig1;
2580 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
2581 }
2582 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2583 }
2584 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
2585 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2586
2587 }
2588
2589 /*----------------------------------------------------------------------------
2590 | Returns the remainder of the quadruple-precision floating-point value `a'
2591 | with respect to the corresponding value `b'. The operation is performed
2592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2593 *----------------------------------------------------------------------------*/
2594
2595 float128 float128_rem( float128 a, float128 b )
2596 {
2597 flag aSign, bSign, zSign;
2598 int32 aExp, bExp, expDiff;
2599 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2600 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
2601 sbits64 sigMean0;
2602 float128 z;
2603
2604 aSig1 = extractFloat128Frac1( a );
2605 aSig0 = extractFloat128Frac0( a );
2606 aExp = extractFloat128Exp( a );
2607 aSign = extractFloat128Sign( a );
2608 bSig1 = extractFloat128Frac1( b );
2609 bSig0 = extractFloat128Frac0( b );
2610 bExp = extractFloat128Exp( b );
2611 bSign = extractFloat128Sign( b );
2612 if ( aExp == 0x7FFF ) {
2613 if ( ( aSig0 | aSig1 )
2614 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
2615 return propagateFloat128NaN( a, b );
2616 }
2617 goto invalid;
2618 }
2619 if ( bExp == 0x7FFF ) {
2620 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2621 return a;
2622 }
2623 if ( bExp == 0 ) {
2624 if ( ( bSig0 | bSig1 ) == 0 ) {
2625 invalid:
2626 float_raise( float_flag_invalid );
2627 z.low = float128_default_nan_low;
2628 z.high = float128_default_nan_high;
2629 return z;
2630 }
2631 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2632 }
2633 if ( aExp == 0 ) {
2634 if ( ( aSig0 | aSig1 ) == 0 ) return a;
2635 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2636 }
2637 expDiff = aExp - bExp;
2638 if ( expDiff < -1 ) return a;
2639 shortShift128Left(
2640 aSig0 | LIT64( 0x0001000000000000 ),
2641 aSig1,
2642 15 - ( expDiff < 0 ),
2643 &aSig0,
2644 &aSig1
2645 );
2646 shortShift128Left(
2647 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
2648 q = le128( bSig0, bSig1, aSig0, aSig1 );
2649 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2650 expDiff -= 64;
2651 while ( 0 < expDiff ) {
2652 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
2653 q = ( 4 < q ) ? q - 4 : 0;
2654 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
2655 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
2656 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
2657 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2658 expDiff -= 61;
2659 }
2660 if ( -64 < expDiff ) {
2661 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
2662 q = ( 4 < q ) ? q - 4 : 0;
2663 q >>= - expDiff;
2664 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
2665 expDiff += 52;
2666 if ( expDiff < 0 ) {
2667 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2668 }
2669 else {
2670 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2671 }
2672 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
2673 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2674 }
2675 else {
2676 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
2677 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
2678 }
2679 do {
2680 alternateASig0 = aSig0;
2681 alternateASig1 = aSig1;
2682 ++q;
2683 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2684 } while ( 0 <= (sbits64) aSig0 );
2685 add128(
2686 aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2687 if ( ( sigMean0 < 0 )
2688 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2689 aSig0 = alternateASig0;
2690 aSig1 = alternateASig1;
2691 }
2692 zSign = ( (sbits64) aSig0 < 0 );
2693 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2694 return
2695 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2696
2697 }
2698
2699 /*----------------------------------------------------------------------------
2700 | Returns the square root of the quadruple-precision floating-point value `a'.
2701 | The operation is performed according to the IEC/IEEE Standard for Binary
2702 | Floating-Point Arithmetic.
2703 *----------------------------------------------------------------------------*/
2704
2705 float128 float128_sqrt( float128 a )
2706 {
2707 flag aSign;
2708 int32 aExp, zExp;
2709 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2710 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2711 float128 z;
2712
2713 aSig1 = extractFloat128Frac1( a );
2714 aSig0 = extractFloat128Frac0( a );
2715 aExp = extractFloat128Exp( a );
2716 aSign = extractFloat128Sign( a );
2717 if ( aExp == 0x7FFF ) {
2718 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
2719 if ( ! aSign ) return a;
2720 goto invalid;
2721 }
2722 if ( aSign ) {
2723 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2724 invalid:
2725 float_raise( float_flag_invalid );
2726 z.low = float128_default_nan_low;
2727 z.high = float128_default_nan_high;
2728 return z;
2729 }
2730 if ( aExp == 0 ) {
2731 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
2732 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2733 }
2734 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
2735 aSig0 |= LIT64( 0x0001000000000000 );
2736 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
2737 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
2738 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
2739 doubleZSig0 = zSig0<<1;
2740 mul64To128( zSig0, zSig0, &term0, &term1 );
2741 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2742 while ( (sbits64) rem0 < 0 ) {
2743 --zSig0;
2744 doubleZSig0 -= 2;
2745 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
2746 }
2747 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
2748 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
2749 if ( zSig1 == 0 ) zSig1 = 1;
2750 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
2751 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
2752 mul64To128( zSig1, zSig1, &term2, &term3 );
2753 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2754 while ( (sbits64) rem1 < 0 ) {
2755 --zSig1;
2756 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
2757 term3 |= 1;
2758 term2 |= doubleZSig0;
2759 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2760 }
2761 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2762 }
2763 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
2764 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
2765
2766 }
2767
2768 /*----------------------------------------------------------------------------
2769 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
2770 | the corresponding value `b', and 0 otherwise. The comparison is performed
2771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2772 *----------------------------------------------------------------------------*/
2773
2774 flag float128_eq( float128 a, float128 b )
2775 {
2776
2777 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
2778 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2779 || ( ( extractFloat128Exp( b ) == 0x7FFF )
2780 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2781 ) {
2782 if ( float128_is_signaling_nan( a )
2783 || float128_is_signaling_nan( b ) ) {
2784 float_raise( float_flag_invalid );
2785 }
2786 return 0;
2787 }
2788 return
2789 ( a.low == b.low )
2790 && ( ( a.high == b.high )
2791 || ( ( a.low == 0 )
2792 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
2793 );
2794
2795 }
2796
2797 /*----------------------------------------------------------------------------
2798 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2799 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2800 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2801 | Arithmetic.
2802 *----------------------------------------------------------------------------*/
2803
2804 flag float128_le( float128 a, float128 b )
2805 {
2806 flag aSign, bSign;
2807
2808 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
2809 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2810 || ( ( extractFloat128Exp( b ) == 0x7FFF )
2811 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2812 ) {
2813 float_raise( float_flag_invalid );
2814 return 0;
2815 }
2816 aSign = extractFloat128Sign( a );
2817 bSign = extractFloat128Sign( b );
2818 if ( aSign != bSign ) {
2819 return
2820 aSign
2821 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2822 == 0 );
2823 }
2824 return
2825 aSign ? le128( b.high, b.low, a.high, a.low )
2826 : le128( a.high, a.low, b.high, b.low );
2827
2828 }
2829
2830 /*----------------------------------------------------------------------------
2831 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2832 | the corresponding value `b', and 0 otherwise. The comparison is performed
2833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2834 *----------------------------------------------------------------------------*/
2835
2836 flag float128_lt( float128 a, float128 b )
2837 {
2838 flag aSign, bSign;
2839
2840 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
2841 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2842 || ( ( extractFloat128Exp( b ) == 0x7FFF )
2843 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2844 ) {
2845 float_raise( float_flag_invalid );
2846 return 0;
2847 }
2848 aSign = extractFloat128Sign( a );
2849 bSign = extractFloat128Sign( b );
2850 if ( aSign != bSign ) {
2851 return
2852 aSign
2853 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2854 != 0 );
2855 }
2856 return
2857 aSign ? lt128( b.high, b.low, a.high, a.low )
2858 : lt128( a.high, a.low, b.high, b.low );
2859
2860 }
2861
2862 /*----------------------------------------------------------------------------
2863 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
2864 | the corresponding value `b', and 0 otherwise. The invalid exception is
2865 | raised if either operand is a NaN. Otherwise, the comparison is performed
2866 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 *----------------------------------------------------------------------------*/
2868
2869 flag float128_eq_signaling( float128 a, float128 b )
2870 {
2871
2872 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
2873 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2874 || ( ( extractFloat128Exp( b ) == 0x7FFF )
2875 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2876 ) {
2877 float_raise( float_flag_invalid );
2878 return 0;
2879 }
2880 return
2881 ( a.low == b.low )
2882 && ( ( a.high == b.high )
2883 || ( ( a.low == 0 )
2884 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
2885 );
2886
2887 }
2888
2889 /*----------------------------------------------------------------------------
2890 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2891 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2892 | cause an exception. Otherwise, the comparison is performed according to the
2893 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2894 *----------------------------------------------------------------------------*/
2895
2896 flag float128_le_quiet( float128 a, float128 b )
2897 {
2898 flag aSign, bSign;
2899
2900 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
2901 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2902 || ( ( extractFloat128Exp( b ) == 0x7FFF )
2903 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2904 ) {
2905 if ( float128_is_signaling_nan( a )
2906 || float128_is_signaling_nan( b ) ) {
2907 float_raise( float_flag_invalid );
2908 }
2909 return 0;
2910 }
2911 aSign = extractFloat128Sign( a );
2912 bSign = extractFloat128Sign( b );
2913 if ( aSign != bSign ) {
2914 return
2915 aSign
2916 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2917 == 0 );
2918 }
2919 return
2920 aSign ? le128( b.high, b.low, a.high, a.low )
2921 : le128( a.high, a.low, b.high, b.low );
2922
2923 }
2924
2925 /*----------------------------------------------------------------------------
2926 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2927 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2928 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2929 | Standard for Binary Floating-Point Arithmetic.
2930 *----------------------------------------------------------------------------*/
2931
2932 flag float128_lt_quiet( float128 a, float128 b )
2933 {
2934 flag aSign, bSign;
2935
2936 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
2937 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2938 || ( ( extractFloat128Exp( b ) == 0x7FFF )
2939 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2940 ) {
2941 if ( float128_is_signaling_nan( a )
2942 || float128_is_signaling_nan( b ) ) {
2943 float_raise( float_flag_invalid );
2944 }
2945 return 0;
2946 }
2947 aSign = extractFloat128Sign( a );
2948 bSign = extractFloat128Sign( b );
2949 if ( aSign != bSign ) {
2950 return
2951 aSign
2952 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2953 != 0 );
2954 }
2955 return
2956 aSign ? lt128( b.high, b.low, a.high, a.low )
2957 : lt128( a.high, a.low, b.high, b.low );
2958
2959 }
2960
2961 #endif
2962