2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
34 #include "softfloat.h"
36 /*----------------------------------------------------------------------------
37 | Primitive arithmetic functions, including multi-word arithmetic, and
38 | division and square root approximations. (Can be specialized to target if
40 *----------------------------------------------------------------------------*/
41 #include "softfloat-macros"
43 /*----------------------------------------------------------------------------
44 | Functions and definitions to determine: (1) whether tininess for underflow
45 | is detected before or after rounding by default, (2) what (if anything)
46 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
47 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
48 | are propagated from function inputs to output. These details are target-
50 *----------------------------------------------------------------------------*/
51 #include "softfloat-specialize"
55 /*----------------------------------------------------------------------------
56 | Returns the fraction bits of the extended double-precision floating-point
58 *----------------------------------------------------------------------------*/
60 INLINE bits64
extractFloatx80Frac( floatx80 a
)
67 /*----------------------------------------------------------------------------
68 | Returns the exponent bits of the extended double-precision floating-point
70 *----------------------------------------------------------------------------*/
72 INLINE int32
extractFloatx80Exp( floatx80 a
)
75 return a
.high
& 0x7FFF;
79 /*----------------------------------------------------------------------------
80 | Returns the sign bit of the extended double-precision floating-point value
82 *----------------------------------------------------------------------------*/
84 INLINE flag
extractFloatx80Sign( floatx80 a
)
91 /*----------------------------------------------------------------------------
92 | Normalizes the subnormal extended double-precision floating-point value
93 | represented by the denormalized significand `aSig'. The normalized exponent
94 | and significand are stored at the locations pointed to by `zExpPtr' and
95 | `zSigPtr', respectively.
96 *----------------------------------------------------------------------------*/
99 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
103 shiftCount
= countLeadingZeros64( aSig
);
104 *zSigPtr
= aSig
<<shiftCount
;
105 *zExpPtr
= 1 - shiftCount
;
109 /*----------------------------------------------------------------------------
110 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
111 | extended double-precision floating-point value, returning the result.
112 *----------------------------------------------------------------------------*/
114 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
119 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
124 /*----------------------------------------------------------------------------
125 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
126 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
127 | and returns the proper extended double-precision floating-point value
128 | corresponding to the abstract input. Ordinarily, the abstract value is
129 | rounded and packed into the extended double-precision format, with the
130 | inexact exception raised if the abstract input cannot be represented
131 | exactly. However, if the abstract value is too large, the overflow and
132 | inexact exceptions are raised and an infinity or maximal finite value is
133 | returned. If the abstract value is too small, the input value is rounded to
134 | a subnormal number, and the underflow and inexact exceptions are raised if
135 | the abstract input cannot be represented exactly as a subnormal extended
136 | double-precision floating-point number.
137 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
138 | number of bits as single or double precision, respectively. Otherwise, the
139 | result is rounded to the full precision of the extended double-precision
141 | The input significand must be normalized or smaller. If the input
142 | significand is not normalized, `zExp' must be 0; in that case, the result
143 | returned is a subnormal number, and it must not require rounding. The
144 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
145 | Floating-Point Arithmetic.
146 *----------------------------------------------------------------------------*/
149 roundAndPackFloatx80(
150 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
154 flag roundNearestEven
, increment
, isTiny
;
155 int64 roundIncrement
, roundMask
, roundBits
;
157 roundingMode
= float_rounding_mode
;
158 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
159 if ( roundingPrecision
== 80 ) goto precision80
;
160 if ( roundingPrecision
== 64 ) {
161 roundIncrement
= LIT64( 0x0000000000000400 );
162 roundMask
= LIT64( 0x00000000000007FF );
164 else if ( roundingPrecision
== 32 ) {
165 roundIncrement
= LIT64( 0x0000008000000000 );
166 roundMask
= LIT64( 0x000000FFFFFFFFFF );
171 zSig0
|= ( zSig1
!= 0 );
172 if ( ! roundNearestEven
) {
173 if ( roundingMode
== float_round_to_zero
) {
177 roundIncrement
= roundMask
;
179 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
182 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
186 roundBits
= zSig0
& roundMask
;
187 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
188 if ( ( 0x7FFE < zExp
)
189 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
195 ( float_detect_tininess
== float_tininess_before_rounding
)
197 || ( zSig0
<= zSig0
+ roundIncrement
);
198 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
200 roundBits
= zSig0
& roundMask
;
201 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
202 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
203 zSig0
+= roundIncrement
;
204 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
205 roundIncrement
= roundMask
+ 1;
206 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
207 roundMask
|= roundIncrement
;
209 zSig0
&= ~ roundMask
;
210 return packFloatx80( zSign
, zExp
, zSig0
);
213 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
214 zSig0
+= roundIncrement
;
215 if ( zSig0
< roundIncrement
) {
217 zSig0
= LIT64( 0x8000000000000000 );
219 roundIncrement
= roundMask
+ 1;
220 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
221 roundMask
|= roundIncrement
;
223 zSig0
&= ~ roundMask
;
224 if ( zSig0
== 0 ) zExp
= 0;
225 return packFloatx80( zSign
, zExp
, zSig0
);
227 increment
= ( (sbits64
) zSig1
< 0 );
228 if ( ! roundNearestEven
) {
229 if ( roundingMode
== float_round_to_zero
) {
234 increment
= ( roundingMode
== float_round_down
) && zSig1
;
237 increment
= ( roundingMode
== float_round_up
) && zSig1
;
241 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
242 if ( ( 0x7FFE < zExp
)
243 || ( ( zExp
== 0x7FFE )
244 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
250 float_raise( float_flag_overflow
| float_flag_inexact
);
251 if ( ( roundingMode
== float_round_to_zero
)
252 || ( zSign
&& ( roundingMode
== float_round_up
) )
253 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
255 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
257 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
261 ( float_detect_tininess
== float_tininess_before_rounding
)
264 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
265 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
267 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
268 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
269 if ( roundNearestEven
) {
270 increment
= ( (sbits64
) zSig1
< 0 );
274 increment
= ( roundingMode
== float_round_down
) && zSig1
;
277 increment
= ( roundingMode
== float_round_up
) && zSig1
;
283 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
284 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
286 return packFloatx80( zSign
, zExp
, zSig0
);
289 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
294 zSig0
= LIT64( 0x8000000000000000 );
297 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
301 if ( zSig0
== 0 ) zExp
= 0;
303 return packFloatx80( zSign
, zExp
, zSig0
);
307 /*----------------------------------------------------------------------------
308 | Takes an abstract floating-point value having sign `zSign', exponent
309 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
310 | and returns the proper extended double-precision floating-point value
311 | corresponding to the abstract input. This routine is just like
312 | `roundAndPackFloatx80' except that the input significand does not have to be
314 *----------------------------------------------------------------------------*/
317 normalizeRoundAndPackFloatx80(
318 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
328 shiftCount
= countLeadingZeros64( zSig0
);
329 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
332 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
340 /*----------------------------------------------------------------------------
341 | Returns the least-significant 64 fraction bits of the quadruple-precision
342 | floating-point value `a'.
343 *----------------------------------------------------------------------------*/
345 INLINE bits64
extractFloat128Frac1( float128 a
)
352 /*----------------------------------------------------------------------------
353 | Returns the most-significant 48 fraction bits of the quadruple-precision
354 | floating-point value `a'.
355 *----------------------------------------------------------------------------*/
357 INLINE bits64
extractFloat128Frac0( float128 a
)
360 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
364 /*----------------------------------------------------------------------------
365 | Returns the exponent bits of the quadruple-precision floating-point value
367 *----------------------------------------------------------------------------*/
369 INLINE int32
extractFloat128Exp( float128 a
)
372 return ( a
.high
>>48 ) & 0x7FFF;
376 /*----------------------------------------------------------------------------
377 | Returns the sign bit of the quadruple-precision floating-point value `a'.
378 *----------------------------------------------------------------------------*/
380 INLINE flag
extractFloat128Sign( float128 a
)
387 /*----------------------------------------------------------------------------
388 | Normalizes the subnormal quadruple-precision floating-point value
389 | represented by the denormalized significand formed by the concatenation of
390 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
391 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
392 | significand are stored at the location pointed to by `zSig0Ptr', and the
393 | least significant 64 bits of the normalized significand are stored at the
394 | location pointed to by `zSig1Ptr'.
395 *----------------------------------------------------------------------------*/
398 normalizeFloat128Subnormal(
409 shiftCount
= countLeadingZeros64( aSig1
) - 15;
410 if ( shiftCount
< 0 ) {
411 *zSig0Ptr
= aSig1
>>( - shiftCount
);
412 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
415 *zSig0Ptr
= aSig1
<<shiftCount
;
418 *zExpPtr
= - shiftCount
- 63;
421 shiftCount
= countLeadingZeros64( aSig0
) - 15;
422 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
423 *zExpPtr
= 1 - shiftCount
;
428 /*----------------------------------------------------------------------------
429 | Packs the sign `zSign', the exponent `zExp', and the significand formed
430 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
431 | floating-point value, returning the result. After being shifted into the
432 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
433 | added together to form the most significant 32 bits of the result. This
434 | means that any integer portion of `zSig0' will be added into the exponent.
435 | Since a properly normalized significand will have an integer portion equal
436 | to 1, the `zExp' input should be 1 less than the desired result exponent
437 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
439 *----------------------------------------------------------------------------*/
442 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
447 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
452 /*----------------------------------------------------------------------------
453 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
454 | and extended significand formed by the concatenation of `zSig0', `zSig1',
455 | and `zSig2', and returns the proper quadruple-precision floating-point value
456 | corresponding to the abstract input. Ordinarily, the abstract value is
457 | simply rounded and packed into the quadruple-precision format, with the
458 | inexact exception raised if the abstract input cannot be represented
459 | exactly. However, if the abstract value is too large, the overflow and
460 | inexact exceptions are raised and an infinity or maximal finite value is
461 | returned. If the abstract value is too small, the input value is rounded to
462 | a subnormal number, and the underflow and inexact exceptions are raised if
463 | the abstract input cannot be represented exactly as a subnormal quadruple-
464 | precision floating-point number.
465 | The input significand must be normalized or smaller. If the input
466 | significand is not normalized, `zExp' must be 0; in that case, the result
467 | returned is a subnormal number, and it must not require rounding. In the
468 | usual case that the input significand is normalized, `zExp' must be 1 less
469 | than the ``true'' floating-point exponent. The handling of underflow and
470 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
471 *----------------------------------------------------------------------------*/
474 roundAndPackFloat128(
475 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2
)
478 flag roundNearestEven
, increment
, isTiny
;
480 roundingMode
= float_rounding_mode
;
481 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
482 increment
= ( (sbits64
) zSig2
< 0 );
483 if ( ! roundNearestEven
) {
484 if ( roundingMode
== float_round_to_zero
) {
489 increment
= ( roundingMode
== float_round_down
) && zSig2
;
492 increment
= ( roundingMode
== float_round_up
) && zSig2
;
496 if ( 0x7FFD <= (bits32
) zExp
) {
497 if ( ( 0x7FFD < zExp
)
498 || ( ( zExp
== 0x7FFD )
500 LIT64( 0x0001FFFFFFFFFFFF ),
501 LIT64( 0xFFFFFFFFFFFFFFFF ),
508 float_raise( float_flag_overflow
| float_flag_inexact
);
509 if ( ( roundingMode
== float_round_to_zero
)
510 || ( zSign
&& ( roundingMode
== float_round_up
) )
511 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
517 LIT64( 0x0000FFFFFFFFFFFF ),
518 LIT64( 0xFFFFFFFFFFFFFFFF )
521 return packFloat128( zSign
, 0x7FFF, 0, 0 );
525 ( float_detect_tininess
== float_tininess_before_rounding
)
531 LIT64( 0x0001FFFFFFFFFFFF ),
532 LIT64( 0xFFFFFFFFFFFFFFFF )
534 shift128ExtraRightJamming(
535 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
537 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
538 if ( roundNearestEven
) {
539 increment
= ( (sbits64
) zSig2
< 0 );
543 increment
= ( roundingMode
== float_round_down
) && zSig2
;
546 increment
= ( roundingMode
== float_round_up
) && zSig2
;
551 if ( zSig2
) float_exception_flags
|= float_flag_inexact
;
553 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
554 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
557 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
559 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
563 /*----------------------------------------------------------------------------
564 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
565 | and significand formed by the concatenation of `zSig0' and `zSig1', and
566 | returns the proper quadruple-precision floating-point value corresponding
567 | to the abstract input. This routine is just like `roundAndPackFloat128'
568 | except that the input significand has fewer bits and does not have to be
569 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
571 *----------------------------------------------------------------------------*/
574 normalizeRoundAndPackFloat128(
575 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
585 shiftCount
= countLeadingZeros64( zSig0
) - 15;
586 if ( 0 <= shiftCount
) {
588 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
591 shift128ExtraRightJamming(
592 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
595 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
603 /*----------------------------------------------------------------------------
604 | Returns the result of converting the 32-bit two's complement integer `a'
605 | to the extended double-precision floating-point format. The conversion
606 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
608 *----------------------------------------------------------------------------*/
610 floatx80
int32_to_floatx80( int32 a
)
617 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
619 absA
= zSign
? - a
: a
;
620 shiftCount
= countLeadingZeros32( absA
) + 32;
622 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
630 /*----------------------------------------------------------------------------
631 | Returns the result of converting the 32-bit two's complement integer `a' to
632 | the quadruple-precision floating-point format. The conversion is performed
633 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
634 *----------------------------------------------------------------------------*/
636 float128
int32_to_float128( int32 a
)
643 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
645 absA
= zSign
? - a
: a
;
646 shiftCount
= countLeadingZeros32( absA
) + 17;
648 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
656 /*----------------------------------------------------------------------------
657 | Returns the result of converting the 64-bit two's complement integer `a'
658 | to the extended double-precision floating-point format. The conversion
659 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
661 *----------------------------------------------------------------------------*/
663 floatx80
int64_to_floatx80( int64 a
)
669 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
671 absA
= zSign
? - a
: a
;
672 shiftCount
= countLeadingZeros64( absA
);
673 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
681 /*----------------------------------------------------------------------------
682 | Returns the result of converting the 64-bit two's complement integer `a' to
683 | the quadruple-precision floating-point format. The conversion is performed
684 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
685 *----------------------------------------------------------------------------*/
687 float128
int64_to_float128( int64 a
)
695 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
697 absA
= zSign
? - a
: a
;
698 shiftCount
= countLeadingZeros64( absA
) + 49;
699 zExp
= 0x406E - shiftCount
;
700 if ( 64 <= shiftCount
) {
709 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
710 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
718 /*----------------------------------------------------------------------------
719 | Returns the result of converting the single-precision floating-point value
720 | `a' to the extended double-precision floating-point format. The conversion
721 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
723 *----------------------------------------------------------------------------*/
725 floatx80
float32_to_floatx80( float32 a
)
731 aSig
= extractFloat32Frac( a
);
732 aExp
= extractFloat32Exp( a
);
733 aSign
= extractFloat32Sign( a
);
734 if ( aExp
== 0xFF ) {
735 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
736 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
739 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
740 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
743 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
751 /*----------------------------------------------------------------------------
752 | Returns the result of converting the single-precision floating-point value
753 | `a' to the double-precision floating-point format. The conversion is
754 | performed according to the IEC/IEEE Standard for Binary Floating-Point
756 *----------------------------------------------------------------------------*/
758 float128
float32_to_float128( float32 a
)
764 aSig
= extractFloat32Frac( a
);
765 aExp
= extractFloat32Exp( a
);
766 aSign
= extractFloat32Sign( a
);
767 if ( aExp
== 0xFF ) {
768 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a
) );
769 return packFloat128( aSign
, 0x7FFF, 0, 0 );
772 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
773 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
776 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
784 /*----------------------------------------------------------------------------
785 | Returns the result of converting the double-precision floating-point value
786 | `a' to the extended double-precision floating-point format. The conversion
787 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
789 *----------------------------------------------------------------------------*/
791 floatx80
float64_to_floatx80( float64 a
)
797 aSig
= extractFloat64Frac( a
);
798 aExp
= extractFloat64Exp( a
);
799 aSign
= extractFloat64Sign( a
);
800 if ( aExp
== 0x7FF ) {
801 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
802 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
805 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
806 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
810 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
818 /*----------------------------------------------------------------------------
819 | Returns the result of converting the double-precision floating-point value
820 | `a' to the quadruple-precision floating-point format. The conversion is
821 | performed according to the IEC/IEEE Standard for Binary Floating-Point
823 *----------------------------------------------------------------------------*/
825 float128
float64_to_float128( float64 a
)
829 bits64 aSig
, zSig0
, zSig1
;
831 aSig
= extractFloat64Frac( a
);
832 aExp
= extractFloat64Exp( a
);
833 aSign
= extractFloat64Sign( a
);
834 if ( aExp
== 0x7FF ) {
835 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a
) );
836 return packFloat128( aSign
, 0x7FFF, 0, 0 );
839 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
840 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
843 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
844 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
852 /*----------------------------------------------------------------------------
853 | Returns the result of converting the extended double-precision floating-
854 | point value `a' to the 32-bit two's complement integer format. The
855 | conversion is performed according to the IEC/IEEE Standard for Binary
856 | Floating-Point Arithmetic---which means in particular that the conversion
857 | is rounded according to the current rounding mode. If `a' is a NaN, the
858 | largest positive integer is returned. Otherwise, if the conversion
859 | overflows, the largest integer with the same sign as `a' is returned.
860 *----------------------------------------------------------------------------*/
862 int32
floatx80_to_int32( floatx80 a
)
865 int32 aExp
, shiftCount
;
868 aSig
= extractFloatx80Frac( a
);
869 aExp
= extractFloatx80Exp( a
);
870 aSign
= extractFloatx80Sign( a
);
871 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
872 shiftCount
= 0x4037 - aExp
;
873 if ( shiftCount
<= 0 ) shiftCount
= 1;
874 shift64RightJamming( aSig
, shiftCount
, &aSig
);
875 return roundAndPackInt32( aSign
, aSig
);
879 /*----------------------------------------------------------------------------
880 | Returns the result of converting the extended double-precision floating-
881 | point value `a' to the 32-bit two's complement integer format. The
882 | conversion is performed according to the IEC/IEEE Standard for Binary
883 | Floating-Point Arithmetic, except that the conversion is always rounded
884 | toward zero. If `a' is a NaN, the largest positive integer is returned.
885 | Otherwise, if the conversion overflows, the largest integer with the same
886 | sign as `a' is returned.
887 *----------------------------------------------------------------------------*/
889 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
892 int32 aExp
, shiftCount
;
893 bits64 aSig
, savedASig
;
896 aSig
= extractFloatx80Frac( a
);
897 aExp
= extractFloatx80Exp( a
);
898 aSign
= extractFloatx80Sign( a
);
899 if ( 0x401E < aExp
) {
900 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
903 else if ( aExp
< 0x3FFF ) {
904 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
907 shiftCount
= 0x403E - aExp
;
911 if ( aSign
) z
= - z
;
912 if ( ( z
< 0 ) ^ aSign
) {
914 float_raise( float_flag_invalid
);
915 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
917 if ( ( aSig
<<shiftCount
) != savedASig
) {
918 float_exception_flags
|= float_flag_inexact
;
924 /*----------------------------------------------------------------------------
925 | Returns the result of converting the extended double-precision floating-
926 | point value `a' to the 64-bit two's complement integer format. The
927 | conversion is performed according to the IEC/IEEE Standard for Binary
928 | Floating-Point Arithmetic---which means in particular that the conversion
929 | is rounded according to the current rounding mode. If `a' is a NaN,
930 | the largest positive integer is returned. Otherwise, if the conversion
931 | overflows, the largest integer with the same sign as `a' is returned.
932 *----------------------------------------------------------------------------*/
934 int64
floatx80_to_int64( floatx80 a
)
937 int32 aExp
, shiftCount
;
938 bits64 aSig
, aSigExtra
;
940 aSig
= extractFloatx80Frac( a
);
941 aExp
= extractFloatx80Exp( a
);
942 aSign
= extractFloatx80Sign( a
);
943 shiftCount
= 0x403E - aExp
;
944 if ( shiftCount
<= 0 ) {
946 float_raise( float_flag_invalid
);
948 || ( ( aExp
== 0x7FFF )
949 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
951 return LIT64( 0x7FFFFFFFFFFFFFFF );
953 return (sbits64
) LIT64( 0x8000000000000000 );
958 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
960 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
964 /*----------------------------------------------------------------------------
965 | Returns the result of converting the extended double-precision floating-
966 | point value `a' to the 64-bit two's complement integer format. The
967 | conversion is performed according to the IEC/IEEE Standard for Binary
968 | Floating-Point Arithmetic, except that the conversion is always rounded
969 | toward zero. If `a' is a NaN, the largest positive integer is returned.
970 | Otherwise, if the conversion overflows, the largest integer with the same
971 | sign as `a' is returned.
972 *----------------------------------------------------------------------------*/
974 int64
floatx80_to_int64_round_to_zero( floatx80 a
)
977 int32 aExp
, shiftCount
;
981 aSig
= extractFloatx80Frac( a
);
982 aExp
= extractFloatx80Exp( a
);
983 aSign
= extractFloatx80Sign( a
);
984 shiftCount
= aExp
- 0x403E;
985 if ( 0 <= shiftCount
) {
986 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
987 if ( ( a
.high
!= 0xC03E ) || aSig
) {
988 float_raise( float_flag_invalid
);
989 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
990 return LIT64( 0x7FFFFFFFFFFFFFFF );
993 return (sbits64
) LIT64( 0x8000000000000000 );
995 else if ( aExp
< 0x3FFF ) {
996 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
999 z
= aSig
>>( - shiftCount
);
1000 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
1001 float_exception_flags
|= float_flag_inexact
;
1003 if ( aSign
) z
= - z
;
1008 /*----------------------------------------------------------------------------
1009 | Returns the result of converting the extended double-precision floating-
1010 | point value `a' to the single-precision floating-point format. The
1011 | conversion is performed according to the IEC/IEEE Standard for Binary
1012 | Floating-Point Arithmetic.
1013 *----------------------------------------------------------------------------*/
1015 float32
floatx80_to_float32( floatx80 a
)
1021 aSig
= extractFloatx80Frac( a
);
1022 aExp
= extractFloatx80Exp( a
);
1023 aSign
= extractFloatx80Sign( a
);
1024 if ( aExp
== 0x7FFF ) {
1025 if ( (bits64
) ( aSig
<<1 ) ) {
1026 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
1028 return packFloat32( aSign
, 0xFF, 0 );
1030 shift64RightJamming( aSig
, 33, &aSig
);
1031 if ( aExp
|| aSig
) aExp
-= 0x3F81;
1032 return roundAndPackFloat32( aSign
, aExp
, aSig
);
1036 /*----------------------------------------------------------------------------
1037 | Returns the result of converting the extended double-precision floating-
1038 | point value `a' to the double-precision floating-point format. The
1039 | conversion is performed according to the IEC/IEEE Standard for Binary
1040 | Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1043 float64
floatx80_to_float64( floatx80 a
)
1049 aSig
= extractFloatx80Frac( a
);
1050 aExp
= extractFloatx80Exp( a
);
1051 aSign
= extractFloatx80Sign( a
);
1052 if ( aExp
== 0x7FFF ) {
1053 if ( (bits64
) ( aSig
<<1 ) ) {
1054 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
1056 return packFloat64( aSign
, 0x7FF, 0 );
1058 shift64RightJamming( aSig
, 1, &zSig
);
1059 if ( aExp
|| aSig
) aExp
-= 0x3C01;
1060 return roundAndPackFloat64( aSign
, aExp
, zSig
);
1066 /*----------------------------------------------------------------------------
1067 | Returns the result of converting the extended double-precision floating-
1068 | point value `a' to the quadruple-precision floating-point format. The
1069 | conversion is performed according to the IEC/IEEE Standard for Binary
1070 | Floating-Point Arithmetic.
1071 *----------------------------------------------------------------------------*/
1073 float128
floatx80_to_float128( floatx80 a
)
1077 bits64 aSig
, zSig0
, zSig1
;
1079 aSig
= extractFloatx80Frac( a
);
1080 aExp
= extractFloatx80Exp( a
);
1081 aSign
= extractFloatx80Sign( a
);
1082 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
1083 return commonNaNToFloat128( floatx80ToCommonNaN( a
) );
1085 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
1086 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
1092 /*----------------------------------------------------------------------------
1093 | Rounds the extended double-precision floating-point value `a' to an integer,
1094 | and returns the result as an extended quadruple-precision floating-point
1095 | value. The operation is performed according to the IEC/IEEE Standard for
1096 | Binary Floating-Point Arithmetic.
1097 *----------------------------------------------------------------------------*/
1099 floatx80
floatx80_round_to_int( floatx80 a
)
1103 bits64 lastBitMask
, roundBitsMask
;
1107 aExp
= extractFloatx80Exp( a
);
1108 if ( 0x403E <= aExp
) {
1109 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
1110 return propagateFloatx80NaN( a
, a
);
1114 if ( aExp
< 0x3FFF ) {
1116 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
1119 float_exception_flags
|= float_flag_inexact
;
1120 aSign
= extractFloatx80Sign( a
);
1121 switch ( float_rounding_mode
) {
1122 case float_round_nearest_even
:
1123 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
1126 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
1129 case float_round_down
:
1132 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
1133 : packFloatx80( 0, 0, 0 );
1134 case float_round_up
:
1136 aSign
? packFloatx80( 1, 0, 0 )
1137 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
1139 return packFloatx80( aSign
, 0, 0 );
1142 lastBitMask
<<= 0x403E - aExp
;
1143 roundBitsMask
= lastBitMask
- 1;
1145 roundingMode
= float_rounding_mode
;
1146 if ( roundingMode
== float_round_nearest_even
) {
1147 z
.low
+= lastBitMask
>>1;
1148 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
1150 else if ( roundingMode
!= float_round_to_zero
) {
1151 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1152 z
.low
+= roundBitsMask
;
1155 z
.low
&= ~ roundBitsMask
;
1158 z
.low
= LIT64( 0x8000000000000000 );
1160 if ( z
.low
!= a
.low
) float_exception_flags
|= float_flag_inexact
;
1165 /*----------------------------------------------------------------------------
1166 | Returns the result of adding the absolute values of the extended double-
1167 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
1168 | negated before being returned. `zSign' is ignored if the result is a NaN.
1169 | The addition is performed according to the IEC/IEEE Standard for Binary
1170 | Floating-Point Arithmetic.
1171 *----------------------------------------------------------------------------*/
1173 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
1175 int32 aExp
, bExp
, zExp
;
1176 bits64 aSig
, bSig
, zSig0
, zSig1
;
1179 aSig
= extractFloatx80Frac( a
);
1180 aExp
= extractFloatx80Exp( a
);
1181 bSig
= extractFloatx80Frac( b
);
1182 bExp
= extractFloatx80Exp( b
);
1183 expDiff
= aExp
- bExp
;
1184 if ( 0 < expDiff
) {
1185 if ( aExp
== 0x7FFF ) {
1186 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1189 if ( bExp
== 0 ) --expDiff
;
1190 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
1193 else if ( expDiff
< 0 ) {
1194 if ( bExp
== 0x7FFF ) {
1195 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1196 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1198 if ( aExp
== 0 ) ++expDiff
;
1199 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
1203 if ( aExp
== 0x7FFF ) {
1204 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
1205 return propagateFloatx80NaN( a
, b
);
1210 zSig0
= aSig
+ bSig
;
1212 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
1218 zSig0
= aSig
+ bSig
;
1219 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
1221 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
1222 zSig0
|= LIT64( 0x8000000000000000 );
1226 roundAndPackFloatx80(
1227 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
1231 /*----------------------------------------------------------------------------
1232 | Returns the result of subtracting the absolute values of the extended
1233 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
1234 | difference is negated before being returned. `zSign' is ignored if the
1235 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1236 | Standard for Binary Floating-Point Arithmetic.
1237 *----------------------------------------------------------------------------*/
1239 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
1241 int32 aExp
, bExp
, zExp
;
1242 bits64 aSig
, bSig
, zSig0
, zSig1
;
1246 aSig
= extractFloatx80Frac( a
);
1247 aExp
= extractFloatx80Exp( a
);
1248 bSig
= extractFloatx80Frac( b
);
1249 bExp
= extractFloatx80Exp( b
);
1250 expDiff
= aExp
- bExp
;
1251 if ( 0 < expDiff
) goto aExpBigger
;
1252 if ( expDiff
< 0 ) goto bExpBigger
;
1253 if ( aExp
== 0x7FFF ) {
1254 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
1255 return propagateFloatx80NaN( a
, b
);
1257 float_raise( float_flag_invalid
);
1258 z
.low
= floatx80_default_nan_low
;
1259 z
.high
= floatx80_default_nan_high
;
1267 if ( bSig
< aSig
) goto aBigger
;
1268 if ( aSig
< bSig
) goto bBigger
;
1269 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
1271 if ( bExp
== 0x7FFF ) {
1272 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1273 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
1275 if ( aExp
== 0 ) ++expDiff
;
1276 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
1278 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
1281 goto normalizeRoundAndPack
;
1283 if ( aExp
== 0x7FFF ) {
1284 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1287 if ( bExp
== 0 ) --expDiff
;
1288 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
1290 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
1292 normalizeRoundAndPack
:
1294 normalizeRoundAndPackFloatx80(
1295 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
1299 /*----------------------------------------------------------------------------
1300 | Returns the result of adding the extended double-precision floating-point
1301 | values `a' and `b'. The operation is performed according to the IEC/IEEE
1302 | Standard for Binary Floating-Point Arithmetic.
1303 *----------------------------------------------------------------------------*/
1305 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
1309 aSign
= extractFloatx80Sign( a
);
1310 bSign
= extractFloatx80Sign( b
);
1311 if ( aSign
== bSign
) {
1312 return addFloatx80Sigs( a
, b
, aSign
);
1315 return subFloatx80Sigs( a
, b
, aSign
);
1320 /*----------------------------------------------------------------------------
1321 | Returns the result of subtracting the extended double-precision floating-
1322 | point values `a' and `b'. The operation is performed according to the
1323 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1324 *----------------------------------------------------------------------------*/
1326 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
1330 aSign
= extractFloatx80Sign( a
);
1331 bSign
= extractFloatx80Sign( b
);
1332 if ( aSign
== bSign
) {
1333 return subFloatx80Sigs( a
, b
, aSign
);
1336 return addFloatx80Sigs( a
, b
, aSign
);
1341 /*----------------------------------------------------------------------------
1342 | Returns the result of multiplying the extended double-precision floating-
1343 | point values `a' and `b'. The operation is performed according to the
1344 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1345 *----------------------------------------------------------------------------*/
1347 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
1349 flag aSign
, bSign
, zSign
;
1350 int32 aExp
, bExp
, zExp
;
1351 bits64 aSig
, bSig
, zSig0
, zSig1
;
1354 aSig
= extractFloatx80Frac( a
);
1355 aExp
= extractFloatx80Exp( a
);
1356 aSign
= extractFloatx80Sign( a
);
1357 bSig
= extractFloatx80Frac( b
);
1358 bExp
= extractFloatx80Exp( b
);
1359 bSign
= extractFloatx80Sign( b
);
1360 zSign
= aSign
^ bSign
;
1361 if ( aExp
== 0x7FFF ) {
1362 if ( (bits64
) ( aSig
<<1 )
1363 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
1364 return propagateFloatx80NaN( a
, b
);
1366 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
1367 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1369 if ( bExp
== 0x7FFF ) {
1370 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1371 if ( ( aExp
| aSig
) == 0 ) {
1373 float_raise( float_flag_invalid
);
1374 z
.low
= floatx80_default_nan_low
;
1375 z
.high
= floatx80_default_nan_high
;
1378 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1381 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
1382 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
1385 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
1386 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
1388 zExp
= aExp
+ bExp
- 0x3FFE;
1389 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
1390 if ( 0 < (sbits64
) zSig0
) {
1391 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
1395 roundAndPackFloatx80(
1396 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
1400 /*----------------------------------------------------------------------------
1401 | Returns the result of dividing the extended double-precision floating-point
1402 | value `a' by the corresponding value `b'. The operation is performed
1403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1404 *----------------------------------------------------------------------------*/
1406 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
1408 flag aSign
, bSign
, zSign
;
1409 int32 aExp
, bExp
, zExp
;
1410 bits64 aSig
, bSig
, zSig0
, zSig1
;
1411 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
1414 aSig
= extractFloatx80Frac( a
);
1415 aExp
= extractFloatx80Exp( a
);
1416 aSign
= extractFloatx80Sign( a
);
1417 bSig
= extractFloatx80Frac( b
);
1418 bExp
= extractFloatx80Exp( b
);
1419 bSign
= extractFloatx80Sign( b
);
1420 zSign
= aSign
^ bSign
;
1421 if ( aExp
== 0x7FFF ) {
1422 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1423 if ( bExp
== 0x7FFF ) {
1424 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1427 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1429 if ( bExp
== 0x7FFF ) {
1430 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1431 return packFloatx80( zSign
, 0, 0 );
1435 if ( ( aExp
| aSig
) == 0 ) {
1437 float_raise( float_flag_invalid
);
1438 z
.low
= floatx80_default_nan_low
;
1439 z
.high
= floatx80_default_nan_high
;
1442 float_raise( float_flag_divbyzero
);
1443 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1445 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
1448 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
1449 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
1451 zExp
= aExp
- bExp
+ 0x3FFE;
1453 if ( bSig
<= aSig
) {
1454 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
1457 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
1458 mul64To128( bSig
, zSig0
, &term0
, &term1
);
1459 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
1460 while ( (sbits64
) rem0
< 0 ) {
1462 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
1464 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
1465 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
1466 mul64To128( bSig
, zSig1
, &term1
, &term2
);
1467 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
1468 while ( (sbits64
) rem1
< 0 ) {
1470 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
1472 zSig1
|= ( ( rem1
| rem2
) != 0 );
1475 roundAndPackFloatx80(
1476 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
1480 /*----------------------------------------------------------------------------
1481 | Returns the remainder of the extended double-precision floating-point value
1482 | `a' with respect to the corresponding value `b'. The operation is performed
1483 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1484 *----------------------------------------------------------------------------*/
1486 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
1488 flag aSign
, bSign
, zSign
;
1489 int32 aExp
, bExp
, expDiff
;
1490 bits64 aSig0
, aSig1
, bSig
;
1491 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
1494 aSig0
= extractFloatx80Frac( a
);
1495 aExp
= extractFloatx80Exp( a
);
1496 aSign
= extractFloatx80Sign( a
);
1497 bSig
= extractFloatx80Frac( b
);
1498 bExp
= extractFloatx80Exp( b
);
1499 bSign
= extractFloatx80Sign( b
);
1500 if ( aExp
== 0x7FFF ) {
1501 if ( (bits64
) ( aSig0
<<1 )
1502 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
1503 return propagateFloatx80NaN( a
, b
);
1507 if ( bExp
== 0x7FFF ) {
1508 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
1514 float_raise( float_flag_invalid
);
1515 z
.low
= floatx80_default_nan_low
;
1516 z
.high
= floatx80_default_nan_high
;
1519 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
1522 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
1523 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
1525 bSig
|= LIT64( 0x8000000000000000 );
1527 expDiff
= aExp
- bExp
;
1529 if ( expDiff
< 0 ) {
1530 if ( expDiff
< -1 ) return a
;
1531 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
1534 q
= ( bSig
<= aSig0
);
1535 if ( q
) aSig0
-= bSig
;
1537 while ( 0 < expDiff
) {
1538 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
1539 q
= ( 2 < q
) ? q
- 2 : 0;
1540 mul64To128( bSig
, q
, &term0
, &term1
);
1541 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
1542 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
1546 if ( 0 < expDiff
) {
1547 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
1548 q
= ( 2 < q
) ? q
- 2 : 0;
1550 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
1551 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
1552 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
1553 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
1555 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
1562 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
1563 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
1564 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
1567 aSig0
= alternateASig0
;
1568 aSig1
= alternateASig1
;
1572 normalizeRoundAndPackFloatx80(
1573 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
1577 /*----------------------------------------------------------------------------
1578 | Returns the square root of the extended double-precision floating-point
1579 | value `a'. The operation is performed according to the IEC/IEEE Standard
1580 | for Binary Floating-Point Arithmetic.
1581 *----------------------------------------------------------------------------*/
1583 floatx80
floatx80_sqrt( floatx80 a
)
1587 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
1588 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
1591 aSig0
= extractFloatx80Frac( a
);
1592 aExp
= extractFloatx80Exp( a
);
1593 aSign
= extractFloatx80Sign( a
);
1594 if ( aExp
== 0x7FFF ) {
1595 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
1596 if ( ! aSign
) return a
;
1600 if ( ( aExp
| aSig0
) == 0 ) return a
;
1602 float_raise( float_flag_invalid
);
1603 z
.low
= floatx80_default_nan_low
;
1604 z
.high
= floatx80_default_nan_high
;
1608 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
1609 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
1611 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
1612 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
1613 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
1614 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
1615 doubleZSig0
= zSig0
<<1;
1616 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
1617 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
1618 while ( (sbits64
) rem0
< 0 ) {
1621 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
1623 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
1624 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
1625 if ( zSig1
== 0 ) zSig1
= 1;
1626 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
1627 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
1628 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
1629 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
1630 while ( (sbits64
) rem1
< 0 ) {
1632 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
1634 term2
|= doubleZSig0
;
1635 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
1637 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
1639 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
1640 zSig0
|= doubleZSig0
;
1642 roundAndPackFloatx80(
1643 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
1647 /*----------------------------------------------------------------------------
1648 | Returns 1 if the extended double-precision floating-point value `a' is
1649 | equal to the corresponding value `b', and 0 otherwise. The comparison is
1650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1652 *----------------------------------------------------------------------------*/
1654 flag
floatx80_eq( floatx80 a
, floatx80 b
)
1657 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
1658 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
1659 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
1660 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
1662 if ( floatx80_is_signaling_nan( a
)
1663 || floatx80_is_signaling_nan( b
) ) {
1664 float_raise( float_flag_invalid
);
1670 && ( ( a
.high
== b
.high
)
1672 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
1677 /*----------------------------------------------------------------------------
1678 | Returns 1 if the extended double-precision floating-point value `a' is
1679 | less than or equal to the corresponding value `b', and 0 otherwise. The
1680 | comparison is performed according to the IEC/IEEE Standard for Binary
1681 | Floating-Point Arithmetic.
1682 *----------------------------------------------------------------------------*/
1684 flag
floatx80_le( floatx80 a
, floatx80 b
)
1688 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
1689 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
1690 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
1691 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
1693 float_raise( float_flag_invalid
);
1696 aSign
= extractFloatx80Sign( a
);
1697 bSign
= extractFloatx80Sign( b
);
1698 if ( aSign
!= bSign
) {
1701 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
1705 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
1706 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
1710 /*----------------------------------------------------------------------------
1711 | Returns 1 if the extended double-precision floating-point value `a' is
1712 | less than the corresponding value `b', and 0 otherwise. The comparison
1713 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1715 *----------------------------------------------------------------------------*/
1717 flag
floatx80_lt( floatx80 a
, floatx80 b
)
1721 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
1722 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
1723 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
1724 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
1726 float_raise( float_flag_invalid
);
1729 aSign
= extractFloatx80Sign( a
);
1730 bSign
= extractFloatx80Sign( b
);
1731 if ( aSign
!= bSign
) {
1734 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
1738 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
1739 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
1743 /*----------------------------------------------------------------------------
1744 | Returns 1 if the extended double-precision floating-point value `a' is equal
1745 | to the corresponding value `b', and 0 otherwise. The invalid exception is
1746 | raised if either operand is a NaN. Otherwise, the comparison is performed
1747 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1748 *----------------------------------------------------------------------------*/
1750 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
1753 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
1754 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
1755 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
1756 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
1758 float_raise( float_flag_invalid
);
1763 && ( ( a
.high
== b
.high
)
1765 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
1770 /*----------------------------------------------------------------------------
1771 | Returns 1 if the extended double-precision floating-point value `a' is less
1772 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
1773 | do not cause an exception. Otherwise, the comparison is performed according
1774 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1775 *----------------------------------------------------------------------------*/
1777 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
1781 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
1782 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
1783 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
1784 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
1786 if ( floatx80_is_signaling_nan( a
)
1787 || floatx80_is_signaling_nan( b
) ) {
1788 float_raise( float_flag_invalid
);
1792 aSign
= extractFloatx80Sign( a
);
1793 bSign
= extractFloatx80Sign( b
);
1794 if ( aSign
!= bSign
) {
1797 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
1801 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
1802 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
1806 /*----------------------------------------------------------------------------
1807 | Returns 1 if the extended double-precision floating-point value `a' is less
1808 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
1809 | an exception. Otherwise, the comparison is performed according to the
1810 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1811 *----------------------------------------------------------------------------*/
1813 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
1817 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
1818 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
1819 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
1820 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
1822 if ( floatx80_is_signaling_nan( a
)
1823 || floatx80_is_signaling_nan( b
) ) {
1824 float_raise( float_flag_invalid
);
1828 aSign
= extractFloatx80Sign( a
);
1829 bSign
= extractFloatx80Sign( b
);
1830 if ( aSign
!= bSign
) {
1833 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
1837 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
1838 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of converting the quadruple-precision floating-point
1848 | value `a' to the 32-bit two's complement integer format. The conversion
1849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1850 | Arithmetic---which means in particular that the conversion is rounded
1851 | according to the current rounding mode. If `a' is a NaN, the largest
1852 | positive integer is returned. Otherwise, if the conversion overflows, the
1853 | largest integer with the same sign as `a' is returned.
1854 *----------------------------------------------------------------------------*/
1856 int32
float128_to_int32( float128 a
)
1859 int32 aExp
, shiftCount
;
1860 bits64 aSig0
, aSig1
;
1862 aSig1
= extractFloat128Frac1( a
);
1863 aSig0
= extractFloat128Frac0( a
);
1864 aExp
= extractFloat128Exp( a
);
1865 aSign
= extractFloat128Sign( a
);
1866 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
1867 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
1868 aSig0
|= ( aSig1
!= 0 );
1869 shiftCount
= 0x4028 - aExp
;
1870 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
1871 return roundAndPackInt32( aSign
, aSig0
);
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of converting the quadruple-precision floating-point
1877 | value `a' to the 32-bit two's complement integer format. The conversion
1878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1879 | Arithmetic, except that the conversion is always rounded toward zero. If
1880 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1881 | conversion overflows, the largest integer with the same sign as `a' is
1883 *----------------------------------------------------------------------------*/
1885 int32
float128_to_int32_round_to_zero( float128 a
)
1888 int32 aExp
, shiftCount
;
1889 bits64 aSig0
, aSig1
, savedASig
;
1892 aSig1
= extractFloat128Frac1( a
);
1893 aSig0
= extractFloat128Frac0( a
);
1894 aExp
= extractFloat128Exp( a
);
1895 aSign
= extractFloat128Sign( a
);
1896 aSig0
|= ( aSig1
!= 0 );
1897 if ( 0x401E < aExp
) {
1898 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
1901 else if ( aExp
< 0x3FFF ) {
1902 if ( aExp
|| aSig0
) float_exception_flags
|= float_flag_inexact
;
1905 aSig0
|= LIT64( 0x0001000000000000 );
1906 shiftCount
= 0x402F - aExp
;
1908 aSig0
>>= shiftCount
;
1910 if ( aSign
) z
= - z
;
1911 if ( ( z
< 0 ) ^ aSign
) {
1913 float_raise( float_flag_invalid
);
1914 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
1916 if ( ( aSig0
<<shiftCount
) != savedASig
) {
1917 float_exception_flags
|= float_flag_inexact
;
1923 /*----------------------------------------------------------------------------
1924 | Returns the result of converting the quadruple-precision floating-point
1925 | value `a' to the 64-bit two's complement integer format. The conversion
1926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1927 | Arithmetic---which means in particular that the conversion is rounded
1928 | according to the current rounding mode. If `a' is a NaN, the largest
1929 | positive integer is returned. Otherwise, if the conversion overflows, the
1930 | largest integer with the same sign as `a' is returned.
1931 *----------------------------------------------------------------------------*/
1933 int64
float128_to_int64( float128 a
)
1936 int32 aExp
, shiftCount
;
1937 bits64 aSig0
, aSig1
;
1939 aSig1
= extractFloat128Frac1( a
);
1940 aSig0
= extractFloat128Frac0( a
);
1941 aExp
= extractFloat128Exp( a
);
1942 aSign
= extractFloat128Sign( a
);
1943 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
1944 shiftCount
= 0x402F - aExp
;
1945 if ( shiftCount
<= 0 ) {
1946 if ( 0x403E < aExp
) {
1947 float_raise( float_flag_invalid
);
1949 || ( ( aExp
== 0x7FFF )
1950 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
1953 return LIT64( 0x7FFFFFFFFFFFFFFF );
1955 return (sbits64
) LIT64( 0x8000000000000000 );
1957 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
1960 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
1962 return roundAndPackInt64( aSign
, aSig0
, aSig1
);
1966 /*----------------------------------------------------------------------------
1967 | Returns the result of converting the quadruple-precision floating-point
1968 | value `a' to the 64-bit two's complement integer format. The conversion
1969 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1970 | Arithmetic, except that the conversion is always rounded toward zero.
1971 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1972 | the conversion overflows, the largest integer with the same sign as `a' is
1974 *----------------------------------------------------------------------------*/
1976 int64
float128_to_int64_round_to_zero( float128 a
)
1979 int32 aExp
, shiftCount
;
1980 bits64 aSig0
, aSig1
;
1983 aSig1
= extractFloat128Frac1( a
);
1984 aSig0
= extractFloat128Frac0( a
);
1985 aExp
= extractFloat128Exp( a
);
1986 aSign
= extractFloat128Sign( a
);
1987 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
1988 shiftCount
= aExp
- 0x402F;
1989 if ( 0 < shiftCount
) {
1990 if ( 0x403E <= aExp
) {
1991 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
1992 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
1993 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
1994 if ( aSig1
) float_exception_flags
|= float_flag_inexact
;
1997 float_raise( float_flag_invalid
);
1998 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
1999 return LIT64( 0x7FFFFFFFFFFFFFFF );
2002 return (sbits64
) LIT64( 0x8000000000000000 );
2004 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
2005 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
2006 float_exception_flags
|= float_flag_inexact
;
2010 if ( aExp
< 0x3FFF ) {
2011 if ( aExp
| aSig0
| aSig1
) {
2012 float_exception_flags
|= float_flag_inexact
;
2016 z
= aSig0
>>( - shiftCount
);
2018 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
2019 float_exception_flags
|= float_flag_inexact
;
2022 if ( aSign
) z
= - z
;
2027 /*----------------------------------------------------------------------------
2028 | Returns the result of converting the quadruple-precision floating-point
2029 | value `a' to the single-precision floating-point format. The conversion
2030 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2032 *----------------------------------------------------------------------------*/
2034 float32
float128_to_float32( float128 a
)
2038 bits64 aSig0
, aSig1
;
2041 aSig1
= extractFloat128Frac1( a
);
2042 aSig0
= extractFloat128Frac0( a
);
2043 aExp
= extractFloat128Exp( a
);
2044 aSign
= extractFloat128Sign( a
);
2045 if ( aExp
== 0x7FFF ) {
2046 if ( aSig0
| aSig1
) {
2047 return commonNaNToFloat32( float128ToCommonNaN( a
) );
2049 return packFloat32( aSign
, 0xFF, 0 );
2051 aSig0
|= ( aSig1
!= 0 );
2052 shift64RightJamming( aSig0
, 18, &aSig0
);
2054 if ( aExp
|| zSig
) {
2058 return roundAndPackFloat32( aSign
, aExp
, zSig
);
2062 /*----------------------------------------------------------------------------
2063 | Returns the result of converting the quadruple-precision floating-point
2064 | value `a' to the double-precision floating-point format. The conversion
2065 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2067 *----------------------------------------------------------------------------*/
2069 float64
float128_to_float64( float128 a
)
2073 bits64 aSig0
, aSig1
;
2075 aSig1
= extractFloat128Frac1( a
);
2076 aSig0
= extractFloat128Frac0( a
);
2077 aExp
= extractFloat128Exp( a
);
2078 aSign
= extractFloat128Sign( a
);
2079 if ( aExp
== 0x7FFF ) {
2080 if ( aSig0
| aSig1
) {
2081 return commonNaNToFloat64( float128ToCommonNaN( a
) );
2083 return packFloat64( aSign
, 0x7FF, 0 );
2085 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
2086 aSig0
|= ( aSig1
!= 0 );
2087 if ( aExp
|| aSig0
) {
2088 aSig0
|= LIT64( 0x4000000000000000 );
2091 return roundAndPackFloat64( aSign
, aExp
, aSig0
);
2097 /*----------------------------------------------------------------------------
2098 | Returns the result of converting the quadruple-precision floating-point
2099 | value `a' to the extended double-precision floating-point format. The
2100 | conversion is performed according to the IEC/IEEE Standard for Binary
2101 | Floating-Point Arithmetic.
2102 *----------------------------------------------------------------------------*/
2104 floatx80
float128_to_floatx80( float128 a
)
2108 bits64 aSig0
, aSig1
;
2110 aSig1
= extractFloat128Frac1( a
);
2111 aSig0
= extractFloat128Frac0( a
);
2112 aExp
= extractFloat128Exp( a
);
2113 aSign
= extractFloat128Sign( a
);
2114 if ( aExp
== 0x7FFF ) {
2115 if ( aSig0
| aSig1
) {
2116 return commonNaNToFloatx80( float128ToCommonNaN( a
) );
2118 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2121 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
2122 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2125 aSig0
|= LIT64( 0x0001000000000000 );
2127 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
2128 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1
);
2134 /*----------------------------------------------------------------------------
2135 | Rounds the quadruple-precision floating-point value `a' to an integer, and
2136 | returns the result as a quadruple-precision floating-point value. The
2137 | operation is performed according to the IEC/IEEE Standard for Binary
2138 | Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2141 float128
float128_round_to_int( float128 a
)
2145 bits64 lastBitMask
, roundBitsMask
;
2149 aExp
= extractFloat128Exp( a
);
2150 if ( 0x402F <= aExp
) {
2151 if ( 0x406F <= aExp
) {
2152 if ( ( aExp
== 0x7FFF )
2153 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
2155 return propagateFloat128NaN( a
, a
);
2160 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
2161 roundBitsMask
= lastBitMask
- 1;
2163 roundingMode
= float_rounding_mode
;
2164 if ( roundingMode
== float_round_nearest_even
) {
2165 if ( lastBitMask
) {
2166 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
2167 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
2170 if ( (sbits64
) z
.low
< 0 ) {
2172 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
2176 else if ( roundingMode
!= float_round_to_zero
) {
2177 if ( extractFloat128Sign( z
)
2178 ^ ( roundingMode
== float_round_up
) ) {
2179 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
2182 z
.low
&= ~ roundBitsMask
;
2185 if ( aExp
< 0x3FFF ) {
2186 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
2187 float_exception_flags
|= float_flag_inexact
;
2188 aSign
= extractFloat128Sign( a
);
2189 switch ( float_rounding_mode
) {
2190 case float_round_nearest_even
:
2191 if ( ( aExp
== 0x3FFE )
2192 && ( extractFloat128Frac0( a
)
2193 | extractFloat128Frac1( a
) )
2195 return packFloat128( aSign
, 0x3FFF, 0, 0 );
2198 case float_round_down
:
2200 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
2201 : packFloat128( 0, 0, 0, 0 );
2202 case float_round_up
:
2204 aSign
? packFloat128( 1, 0, 0, 0 )
2205 : packFloat128( 0, 0x3FFF, 0, 0 );
2207 return packFloat128( aSign
, 0, 0, 0 );
2210 lastBitMask
<<= 0x402F - aExp
;
2211 roundBitsMask
= lastBitMask
- 1;
2214 roundingMode
= float_rounding_mode
;
2215 if ( roundingMode
== float_round_nearest_even
) {
2216 z
.high
+= lastBitMask
>>1;
2217 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
2218 z
.high
&= ~ lastBitMask
;
2221 else if ( roundingMode
!= float_round_to_zero
) {
2222 if ( extractFloat128Sign( z
)
2223 ^ ( roundingMode
== float_round_up
) ) {
2224 z
.high
|= ( a
.low
!= 0 );
2225 z
.high
+= roundBitsMask
;
2228 z
.high
&= ~ roundBitsMask
;
2230 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
2231 float_exception_flags
|= float_flag_inexact
;
2237 /*----------------------------------------------------------------------------
2238 | Returns the result of adding the absolute values of the quadruple-precision
2239 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2240 | before being returned. `zSign' is ignored if the result is a NaN.
2241 | The addition is performed according to the IEC/IEEE Standard for Binary
2242 | Floating-Point Arithmetic.
2243 *----------------------------------------------------------------------------*/
2245 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign
)
2247 int32 aExp
, bExp
, zExp
;
2248 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
2251 aSig1
= extractFloat128Frac1( a
);
2252 aSig0
= extractFloat128Frac0( a
);
2253 aExp
= extractFloat128Exp( a
);
2254 bSig1
= extractFloat128Frac1( b
);
2255 bSig0
= extractFloat128Frac0( b
);
2256 bExp
= extractFloat128Exp( b
);
2257 expDiff
= aExp
- bExp
;
2258 if ( 0 < expDiff
) {
2259 if ( aExp
== 0x7FFF ) {
2260 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
2267 bSig0
|= LIT64( 0x0001000000000000 );
2269 shift128ExtraRightJamming(
2270 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
2273 else if ( expDiff
< 0 ) {
2274 if ( bExp
== 0x7FFF ) {
2275 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
2276 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2282 aSig0
|= LIT64( 0x0001000000000000 );
2284 shift128ExtraRightJamming(
2285 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
2289 if ( aExp
== 0x7FFF ) {
2290 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
2291 return propagateFloat128NaN( a
, b
);
2295 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
2296 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
2298 zSig0
|= LIT64( 0x0002000000000000 );
2302 aSig0
|= LIT64( 0x0001000000000000 );
2303 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
2305 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
2308 shift128ExtraRightJamming(
2309 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
2311 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
2315 /*----------------------------------------------------------------------------
2316 | Returns the result of subtracting the absolute values of the quadruple-
2317 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2318 | difference is negated before being returned. `zSign' is ignored if the
2319 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2320 | Standard for Binary Floating-Point Arithmetic.
2321 *----------------------------------------------------------------------------*/
2323 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign
)
2325 int32 aExp
, bExp
, zExp
;
2326 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
2330 aSig1
= extractFloat128Frac1( a
);
2331 aSig0
= extractFloat128Frac0( a
);
2332 aExp
= extractFloat128Exp( a
);
2333 bSig1
= extractFloat128Frac1( b
);
2334 bSig0
= extractFloat128Frac0( b
);
2335 bExp
= extractFloat128Exp( b
);
2336 expDiff
= aExp
- bExp
;
2337 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
2338 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
2339 if ( 0 < expDiff
) goto aExpBigger
;
2340 if ( expDiff
< 0 ) goto bExpBigger
;
2341 if ( aExp
== 0x7FFF ) {
2342 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
2343 return propagateFloat128NaN( a
, b
);
2345 float_raise( float_flag_invalid
);
2346 z
.low
= float128_default_nan_low
;
2347 z
.high
= float128_default_nan_high
;
2354 if ( bSig0
< aSig0
) goto aBigger
;
2355 if ( aSig0
< bSig0
) goto bBigger
;
2356 if ( bSig1
< aSig1
) goto aBigger
;
2357 if ( aSig1
< bSig1
) goto bBigger
;
2358 return packFloat128( float_rounding_mode
== float_round_down
, 0, 0, 0 );
2360 if ( bExp
== 0x7FFF ) {
2361 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
2362 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
2368 aSig0
|= LIT64( 0x4000000000000000 );
2370 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
2371 bSig0
|= LIT64( 0x4000000000000000 );
2373 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
2376 goto normalizeRoundAndPack
;
2378 if ( aExp
== 0x7FFF ) {
2379 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
2386 bSig0
|= LIT64( 0x4000000000000000 );
2388 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
2389 aSig0
|= LIT64( 0x4000000000000000 );
2391 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
2393 normalizeRoundAndPack
:
2395 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1
);
2399 /*----------------------------------------------------------------------------
2400 | Returns the result of adding the quadruple-precision floating-point values
2401 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2402 | for Binary Floating-Point Arithmetic.
2403 *----------------------------------------------------------------------------*/
2405 float128
float128_add( float128 a
, float128 b
)
2409 aSign
= extractFloat128Sign( a
);
2410 bSign
= extractFloat128Sign( b
);
2411 if ( aSign
== bSign
) {
2412 return addFloat128Sigs( a
, b
, aSign
);
2415 return subFloat128Sigs( a
, b
, aSign
);
2420 /*----------------------------------------------------------------------------
2421 | Returns the result of subtracting the quadruple-precision floating-point
2422 | values `a' and `b'. The operation is performed according to the IEC/IEEE
2423 | Standard for Binary Floating-Point Arithmetic.
2424 *----------------------------------------------------------------------------*/
2426 float128
float128_sub( float128 a
, float128 b
)
2430 aSign
= extractFloat128Sign( a
);
2431 bSign
= extractFloat128Sign( b
);
2432 if ( aSign
== bSign
) {
2433 return subFloat128Sigs( a
, b
, aSign
);
2436 return addFloat128Sigs( a
, b
, aSign
);
2441 /*----------------------------------------------------------------------------
2442 | Returns the result of multiplying the quadruple-precision floating-point
2443 | values `a' and `b'. The operation is performed according to the IEC/IEEE
2444 | Standard for Binary Floating-Point Arithmetic.
2445 *----------------------------------------------------------------------------*/
2447 float128
float128_mul( float128 a
, float128 b
)
2449 flag aSign
, bSign
, zSign
;
2450 int32 aExp
, bExp
, zExp
;
2451 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
2454 aSig1
= extractFloat128Frac1( a
);
2455 aSig0
= extractFloat128Frac0( a
);
2456 aExp
= extractFloat128Exp( a
);
2457 aSign
= extractFloat128Sign( a
);
2458 bSig1
= extractFloat128Frac1( b
);
2459 bSig0
= extractFloat128Frac0( b
);
2460 bExp
= extractFloat128Exp( b
);
2461 bSign
= extractFloat128Sign( b
);
2462 zSign
= aSign
^ bSign
;
2463 if ( aExp
== 0x7FFF ) {
2464 if ( ( aSig0
| aSig1
)
2465 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
2466 return propagateFloat128NaN( a
, b
);
2468 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
2469 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2471 if ( bExp
== 0x7FFF ) {
2472 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
2473 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
2475 float_raise( float_flag_invalid
);
2476 z
.low
= float128_default_nan_low
;
2477 z
.high
= float128_default_nan_high
;
2480 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2483 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
2484 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2487 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
2488 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
2490 zExp
= aExp
+ bExp
- 0x4000;
2491 aSig0
|= LIT64( 0x0001000000000000 );
2492 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
2493 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
2494 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
2495 zSig2
|= ( zSig3
!= 0 );
2496 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
2497 shift128ExtraRightJamming(
2498 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
2501 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
2505 /*----------------------------------------------------------------------------
2506 | Returns the result of dividing the quadruple-precision floating-point value
2507 | `a' by the corresponding value `b'. The operation is performed according to
2508 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2509 *----------------------------------------------------------------------------*/
2511 float128
float128_div( float128 a
, float128 b
)
2513 flag aSign
, bSign
, zSign
;
2514 int32 aExp
, bExp
, zExp
;
2515 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
2516 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
2519 aSig1
= extractFloat128Frac1( a
);
2520 aSig0
= extractFloat128Frac0( a
);
2521 aExp
= extractFloat128Exp( a
);
2522 aSign
= extractFloat128Sign( a
);
2523 bSig1
= extractFloat128Frac1( b
);
2524 bSig0
= extractFloat128Frac0( b
);
2525 bExp
= extractFloat128Exp( b
);
2526 bSign
= extractFloat128Sign( b
);
2527 zSign
= aSign
^ bSign
;
2528 if ( aExp
== 0x7FFF ) {
2529 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
2530 if ( bExp
== 0x7FFF ) {
2531 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
2534 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2536 if ( bExp
== 0x7FFF ) {
2537 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
2538 return packFloat128( zSign
, 0, 0, 0 );
2541 if ( ( bSig0
| bSig1
) == 0 ) {
2542 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
2544 float_raise( float_flag_invalid
);
2545 z
.low
= float128_default_nan_low
;
2546 z
.high
= float128_default_nan_high
;
2549 float_raise( float_flag_divbyzero
);
2550 return packFloat128( zSign
, 0x7FFF, 0, 0 );
2552 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
2555 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
2556 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2558 zExp
= aExp
- bExp
+ 0x3FFD;
2560 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
2562 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
2563 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
2564 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
2567 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
2568 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
2569 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
2570 while ( (sbits64
) rem0
< 0 ) {
2572 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
2574 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
2575 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
2576 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
2577 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
2578 while ( (sbits64
) rem1
< 0 ) {
2580 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
2582 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2584 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
2585 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
2589 /*----------------------------------------------------------------------------
2590 | Returns the remainder of the quadruple-precision floating-point value `a'
2591 | with respect to the corresponding value `b'. The operation is performed
2592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2593 *----------------------------------------------------------------------------*/
2595 float128
float128_rem( float128 a
, float128 b
)
2597 flag aSign
, bSign
, zSign
;
2598 int32 aExp
, bExp
, expDiff
;
2599 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
2600 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
2604 aSig1
= extractFloat128Frac1( a
);
2605 aSig0
= extractFloat128Frac0( a
);
2606 aExp
= extractFloat128Exp( a
);
2607 aSign
= extractFloat128Sign( a
);
2608 bSig1
= extractFloat128Frac1( b
);
2609 bSig0
= extractFloat128Frac0( b
);
2610 bExp
= extractFloat128Exp( b
);
2611 bSign
= extractFloat128Sign( b
);
2612 if ( aExp
== 0x7FFF ) {
2613 if ( ( aSig0
| aSig1
)
2614 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
2615 return propagateFloat128NaN( a
, b
);
2619 if ( bExp
== 0x7FFF ) {
2620 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
2624 if ( ( bSig0
| bSig1
) == 0 ) {
2626 float_raise( float_flag_invalid
);
2627 z
.low
= float128_default_nan_low
;
2628 z
.high
= float128_default_nan_high
;
2631 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
2634 if ( ( aSig0
| aSig1
) == 0 ) return a
;
2635 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2637 expDiff
= aExp
- bExp
;
2638 if ( expDiff
< -1 ) return a
;
2640 aSig0
| LIT64( 0x0001000000000000 ),
2642 15 - ( expDiff
< 0 ),
2647 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
2648 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
2649 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2651 while ( 0 < expDiff
) {
2652 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
2653 q
= ( 4 < q
) ? q
- 4 : 0;
2654 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2655 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
2656 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
2657 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
2660 if ( -64 < expDiff
) {
2661 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
2662 q
= ( 4 < q
) ? q
- 4 : 0;
2664 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
2666 if ( expDiff
< 0 ) {
2667 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
2670 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
2672 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
2673 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
2676 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
2677 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
2680 alternateASig0
= aSig0
;
2681 alternateASig1
= aSig1
;
2683 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
2684 } while ( 0 <= (sbits64
) aSig0
);
2686 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
2687 if ( ( sigMean0
< 0 )
2688 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
2689 aSig0
= alternateASig0
;
2690 aSig1
= alternateASig1
;
2692 zSign
= ( (sbits64
) aSig0
< 0 );
2693 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
2695 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
2699 /*----------------------------------------------------------------------------
2700 | Returns the square root of the quadruple-precision floating-point value `a'.
2701 | The operation is performed according to the IEC/IEEE Standard for Binary
2702 | Floating-Point Arithmetic.
2703 *----------------------------------------------------------------------------*/
2705 float128
float128_sqrt( float128 a
)
2709 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
2710 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
2713 aSig1
= extractFloat128Frac1( a
);
2714 aSig0
= extractFloat128Frac0( a
);
2715 aExp
= extractFloat128Exp( a
);
2716 aSign
= extractFloat128Sign( a
);
2717 if ( aExp
== 0x7FFF ) {
2718 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a
);
2719 if ( ! aSign
) return a
;
2723 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
2725 float_raise( float_flag_invalid
);
2726 z
.low
= float128_default_nan_low
;
2727 z
.high
= float128_default_nan_high
;
2731 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
2732 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
2734 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
2735 aSig0
|= LIT64( 0x0001000000000000 );
2736 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
2737 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
2738 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
2739 doubleZSig0
= zSig0
<<1;
2740 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
2741 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
2742 while ( (sbits64
) rem0
< 0 ) {
2745 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
2747 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
2748 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
2749 if ( zSig1
== 0 ) zSig1
= 1;
2750 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
2751 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
2752 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
2753 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2754 while ( (sbits64
) rem1
< 0 ) {
2756 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
2758 term2
|= doubleZSig0
;
2759 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
2761 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
2763 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
2764 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2
);
2768 /*----------------------------------------------------------------------------
2769 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
2770 | the corresponding value `b', and 0 otherwise. The comparison is performed
2771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2772 *----------------------------------------------------------------------------*/
2774 flag
float128_eq( float128 a
, float128 b
)
2777 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
2778 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
2779 || ( ( extractFloat128Exp( b
) == 0x7FFF )
2780 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
2782 if ( float128_is_signaling_nan( a
)
2783 || float128_is_signaling_nan( b
) ) {
2784 float_raise( float_flag_invalid
);
2790 && ( ( a
.high
== b
.high
)
2792 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
2797 /*----------------------------------------------------------------------------
2798 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2799 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2800 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2802 *----------------------------------------------------------------------------*/
2804 flag
float128_le( float128 a
, float128 b
)
2808 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
2809 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
2810 || ( ( extractFloat128Exp( b
) == 0x7FFF )
2811 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
2813 float_raise( float_flag_invalid
);
2816 aSign
= extractFloat128Sign( a
);
2817 bSign
= extractFloat128Sign( b
);
2818 if ( aSign
!= bSign
) {
2821 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
2825 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
2826 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
2830 /*----------------------------------------------------------------------------
2831 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2832 | the corresponding value `b', and 0 otherwise. The comparison is performed
2833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2834 *----------------------------------------------------------------------------*/
2836 flag
float128_lt( float128 a
, float128 b
)
2840 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
2841 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
2842 || ( ( extractFloat128Exp( b
) == 0x7FFF )
2843 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
2845 float_raise( float_flag_invalid
);
2848 aSign
= extractFloat128Sign( a
);
2849 bSign
= extractFloat128Sign( b
);
2850 if ( aSign
!= bSign
) {
2853 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
2857 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
2858 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
2862 /*----------------------------------------------------------------------------
2863 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
2864 | the corresponding value `b', and 0 otherwise. The invalid exception is
2865 | raised if either operand is a NaN. Otherwise, the comparison is performed
2866 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 *----------------------------------------------------------------------------*/
2869 flag
float128_eq_signaling( float128 a
, float128 b
)
2872 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
2873 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
2874 || ( ( extractFloat128Exp( b
) == 0x7FFF )
2875 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
2877 float_raise( float_flag_invalid
);
2882 && ( ( a
.high
== b
.high
)
2884 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
2889 /*----------------------------------------------------------------------------
2890 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2891 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2892 | cause an exception. Otherwise, the comparison is performed according to the
2893 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2894 *----------------------------------------------------------------------------*/
2896 flag
float128_le_quiet( float128 a
, float128 b
)
2900 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
2901 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
2902 || ( ( extractFloat128Exp( b
) == 0x7FFF )
2903 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
2905 if ( float128_is_signaling_nan( a
)
2906 || float128_is_signaling_nan( b
) ) {
2907 float_raise( float_flag_invalid
);
2911 aSign
= extractFloat128Sign( a
);
2912 bSign
= extractFloat128Sign( b
);
2913 if ( aSign
!= bSign
) {
2916 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
2920 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
2921 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
2925 /*----------------------------------------------------------------------------
2926 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2927 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2928 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2929 | Standard for Binary Floating-Point Arithmetic.
2930 *----------------------------------------------------------------------------*/
2932 flag
float128_lt_quiet( float128 a
, float128 b
)
2936 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
2937 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
2938 || ( ( extractFloat128Exp( b
) == 0x7FFF )
2939 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
2941 if ( float128_is_signaling_nan( a
)
2942 || float128_is_signaling_nan( b
) ) {
2943 float_raise( float_flag_invalid
);
2947 aSign
= extractFloat128Sign( a
);
2948 bSign
= extractFloat128Sign( b
);
2949 if ( aSign
!= bSign
) {
2952 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
2956 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
2957 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);