2 /*============================================================================
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
31 =============================================================================*/
34 #include "softfloat.h"
36 /*----------------------------------------------------------------------------
37 | Floating-point rounding mode, extended double-precision rounding precision,
38 | and exception flags.
39 *----------------------------------------------------------------------------*/
40 int8 float_rounding_mode
= float_round_nearest_even
;
41 int8 float_exception_flags
= 0;
43 int8 floatx80_rounding_precision
= 80;
46 /*----------------------------------------------------------------------------
47 | Primitive arithmetic functions, including multi-word arithmetic, and
48 | division and square root approximations. (Can be specialized to target if
50 *----------------------------------------------------------------------------*/
51 #include "softfloat-macros"
53 /*----------------------------------------------------------------------------
54 | Functions and definitions to determine: (1) whether tininess for underflow
55 | is detected before or after rounding by default, (2) what (if anything)
56 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
57 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
58 | are propagated from function inputs to output. These details are target-
60 *----------------------------------------------------------------------------*/
61 #include "softfloat-specialize"
63 /*----------------------------------------------------------------------------
64 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
65 | and 7, and returns the properly rounded 32-bit integer corresponding to the
66 | input. If `zSign' is 1, the input is negated before being converted to an
67 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
68 | is simply rounded to an integer, with the inexact exception raised if the
69 | input cannot be represented exactly as an integer. However, if the fixed-
70 | point input is too large, the invalid exception is raised and the largest
71 | positive or negative integer is returned.
72 *----------------------------------------------------------------------------*/
74 static int32
roundAndPackInt32( flag zSign
, bits64 absZ
)
77 flag roundNearestEven
;
78 int8 roundIncrement
, roundBits
;
81 roundingMode
= float_rounding_mode
;
82 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
83 roundIncrement
= 0x40;
84 if ( ! roundNearestEven
) {
85 if ( roundingMode
== float_round_to_zero
) {
89 roundIncrement
= 0x7F;
91 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
94 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
98 roundBits
= absZ
& 0x7F;
99 absZ
= ( absZ
+ roundIncrement
)>>7;
100 absZ
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
102 if ( zSign
) z
= - z
;
103 if ( ( absZ
>>32 ) || ( z
&& ( ( z
< 0 ) ^ zSign
) ) ) {
104 float_raise( float_flag_invalid
);
105 return zSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
107 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
112 /*----------------------------------------------------------------------------
113 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
114 | `absZ1', with binary point between bits 63 and 64 (between the input words),
115 | and returns the properly rounded 64-bit integer corresponding to the input.
116 | If `zSign' is 1, the input is negated before being converted to an integer.
117 | Ordinarily, the fixed-point input is simply rounded to an integer, with
118 | the inexact exception raised if the input cannot be represented exactly as
119 | an integer. However, if the fixed-point input is too large, the invalid
120 | exception is raised and the largest positive or negative integer is
122 *----------------------------------------------------------------------------*/
124 static int64
roundAndPackInt64( flag zSign
, bits64 absZ0
, bits64 absZ1
)
127 flag roundNearestEven
, increment
;
130 roundingMode
= float_rounding_mode
;
131 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
132 increment
= ( (sbits64
) absZ1
< 0 );
133 if ( ! roundNearestEven
) {
134 if ( roundingMode
== float_round_to_zero
) {
139 increment
= ( roundingMode
== float_round_down
) && absZ1
;
142 increment
= ( roundingMode
== float_round_up
) && absZ1
;
148 if ( absZ0
== 0 ) goto overflow
;
149 absZ0
&= ~ ( ( (bits64
) ( absZ1
<<1 ) == 0 ) & roundNearestEven
);
152 if ( zSign
) z
= - z
;
153 if ( z
&& ( ( z
< 0 ) ^ zSign
) ) {
155 float_raise( float_flag_invalid
);
157 zSign
? (sbits64
) LIT64( 0x8000000000000000 )
158 : LIT64( 0x7FFFFFFFFFFFFFFF );
160 if ( absZ1
) float_exception_flags
|= float_flag_inexact
;
165 /*----------------------------------------------------------------------------
166 | Returns the fraction bits of the single-precision floating-point value `a'.
167 *----------------------------------------------------------------------------*/
169 INLINE bits32
extractFloat32Frac( float32 a
)
172 return a
& 0x007FFFFF;
176 /*----------------------------------------------------------------------------
177 | Returns the exponent bits of the single-precision floating-point value `a'.
178 *----------------------------------------------------------------------------*/
180 INLINE int16
extractFloat32Exp( float32 a
)
183 return ( a
>>23 ) & 0xFF;
187 /*----------------------------------------------------------------------------
188 | Returns the sign bit of the single-precision floating-point value `a'.
189 *----------------------------------------------------------------------------*/
191 INLINE flag
extractFloat32Sign( float32 a
)
198 /*----------------------------------------------------------------------------
199 | Normalizes the subnormal single-precision floating-point value represented
200 | by the denormalized significand `aSig'. The normalized exponent and
201 | significand are stored at the locations pointed to by `zExpPtr' and
202 | `zSigPtr', respectively.
203 *----------------------------------------------------------------------------*/
206 normalizeFloat32Subnormal( bits32 aSig
, int16
*zExpPtr
, bits32
*zSigPtr
)
210 shiftCount
= countLeadingZeros32( aSig
) - 8;
211 *zSigPtr
= aSig
<<shiftCount
;
212 *zExpPtr
= 1 - shiftCount
;
216 /*----------------------------------------------------------------------------
217 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
218 | single-precision floating-point value, returning the result. After being
219 | shifted into the proper positions, the three fields are simply added
220 | together to form the result. This means that any integer portion of `zSig'
221 | will be added into the exponent. Since a properly normalized significand
222 | will have an integer portion equal to 1, the `zExp' input should be 1 less
223 | than the desired result exponent whenever `zSig' is a complete, normalized
225 *----------------------------------------------------------------------------*/
227 INLINE float32
packFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
230 return ( ( (bits32
) zSign
)<<31 ) + ( ( (bits32
) zExp
)<<23 ) + zSig
;
234 /*----------------------------------------------------------------------------
235 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
236 | and significand `zSig', and returns the proper single-precision floating-
237 | point value corresponding to the abstract input. Ordinarily, the abstract
238 | value is simply rounded and packed into the single-precision format, with
239 | the inexact exception raised if the abstract input cannot be represented
240 | exactly. However, if the abstract value is too large, the overflow and
241 | inexact exceptions are raised and an infinity or maximal finite value is
242 | returned. If the abstract value is too small, the input value is rounded to
243 | a subnormal number, and the underflow and inexact exceptions are raised if
244 | the abstract input cannot be represented exactly as a subnormal single-
245 | precision floating-point number.
246 | The input significand `zSig' has its binary point between bits 30
247 | and 29, which is 7 bits to the left of the usual location. This shifted
248 | significand must be normalized or smaller. If `zSig' is not normalized,
249 | `zExp' must be 0; in that case, the result returned is a subnormal number,
250 | and it must not require rounding. In the usual case that `zSig' is
251 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
252 | The handling of underflow and overflow follows the IEC/IEEE Standard for
253 | Binary Floating-Point Arithmetic.
254 *----------------------------------------------------------------------------*/
256 static float32
roundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
259 flag roundNearestEven
;
260 int8 roundIncrement
, roundBits
;
263 roundingMode
= float_rounding_mode
;
264 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
265 roundIncrement
= 0x40;
266 if ( ! roundNearestEven
) {
267 if ( roundingMode
== float_round_to_zero
) {
271 roundIncrement
= 0x7F;
273 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
276 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
280 roundBits
= zSig
& 0x7F;
281 if ( 0xFD <= (bits16
) zExp
) {
283 || ( ( zExp
== 0xFD )
284 && ( (sbits32
) ( zSig
+ roundIncrement
) < 0 ) )
286 float_raise( float_flag_overflow
| float_flag_inexact
);
287 return packFloat32( zSign
, 0xFF, 0 ) - ( roundIncrement
== 0 );
291 ( float_detect_tininess
== float_tininess_before_rounding
)
293 || ( zSig
+ roundIncrement
< 0x80000000 );
294 shift32RightJamming( zSig
, - zExp
, &zSig
);
296 roundBits
= zSig
& 0x7F;
297 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
300 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
301 zSig
= ( zSig
+ roundIncrement
)>>7;
302 zSig
&= ~ ( ( ( roundBits
^ 0x40 ) == 0 ) & roundNearestEven
);
303 if ( zSig
== 0 ) zExp
= 0;
304 return packFloat32( zSign
, zExp
, zSig
);
308 /*----------------------------------------------------------------------------
309 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
310 | and significand `zSig', and returns the proper single-precision floating-
311 | point value corresponding to the abstract input. This routine is just like
312 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
313 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
314 | floating-point exponent.
315 *----------------------------------------------------------------------------*/
318 normalizeRoundAndPackFloat32( flag zSign
, int16 zExp
, bits32 zSig
)
322 shiftCount
= countLeadingZeros32( zSig
) - 1;
323 return roundAndPackFloat32( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
327 /*----------------------------------------------------------------------------
328 | Returns the fraction bits of the double-precision floating-point value `a'.
329 *----------------------------------------------------------------------------*/
331 INLINE bits64
extractFloat64Frac( float64 a
)
334 return a
& LIT64( 0x000FFFFFFFFFFFFF );
338 /*----------------------------------------------------------------------------
339 | Returns the exponent bits of the double-precision floating-point value `a'.
340 *----------------------------------------------------------------------------*/
342 INLINE int16
extractFloat64Exp( float64 a
)
345 return ( a
>>52 ) & 0x7FF;
349 /*----------------------------------------------------------------------------
350 | Returns the sign bit of the double-precision floating-point value `a'.
351 *----------------------------------------------------------------------------*/
353 INLINE flag
extractFloat64Sign( float64 a
)
360 /*----------------------------------------------------------------------------
361 | Normalizes the subnormal double-precision floating-point value represented
362 | by the denormalized significand `aSig'. The normalized exponent and
363 | significand are stored at the locations pointed to by `zExpPtr' and
364 | `zSigPtr', respectively.
365 *----------------------------------------------------------------------------*/
368 normalizeFloat64Subnormal( bits64 aSig
, int16
*zExpPtr
, bits64
*zSigPtr
)
372 shiftCount
= countLeadingZeros64( aSig
) - 11;
373 *zSigPtr
= aSig
<<shiftCount
;
374 *zExpPtr
= 1 - shiftCount
;
378 /*----------------------------------------------------------------------------
379 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
380 | double-precision floating-point value, returning the result. After being
381 | shifted into the proper positions, the three fields are simply added
382 | together to form the result. This means that any integer portion of `zSig'
383 | will be added into the exponent. Since a properly normalized significand
384 | will have an integer portion equal to 1, the `zExp' input should be 1 less
385 | than the desired result exponent whenever `zSig' is a complete, normalized
387 *----------------------------------------------------------------------------*/
389 INLINE float64
packFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
392 return ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<52 ) + zSig
;
396 /*----------------------------------------------------------------------------
397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
398 | and significand `zSig', and returns the proper double-precision floating-
399 | point value corresponding to the abstract input. Ordinarily, the abstract
400 | value is simply rounded and packed into the double-precision format, with
401 | the inexact exception raised if the abstract input cannot be represented
402 | exactly. However, if the abstract value is too large, the overflow and
403 | inexact exceptions are raised and an infinity or maximal finite value is
404 | returned. If the abstract value is too small, the input value is rounded
405 | to a subnormal number, and the underflow and inexact exceptions are raised
406 | if the abstract input cannot be represented exactly as a subnormal double-
407 | precision floating-point number.
408 | The input significand `zSig' has its binary point between bits 62
409 | and 61, which is 10 bits to the left of the usual location. This shifted
410 | significand must be normalized or smaller. If `zSig' is not normalized,
411 | `zExp' must be 0; in that case, the result returned is a subnormal number,
412 | and it must not require rounding. In the usual case that `zSig' is
413 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
414 | The handling of underflow and overflow follows the IEC/IEEE Standard for
415 | Binary Floating-Point Arithmetic.
416 *----------------------------------------------------------------------------*/
418 static float64
roundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
421 flag roundNearestEven
;
422 int16 roundIncrement
, roundBits
;
425 roundingMode
= float_rounding_mode
;
426 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
427 roundIncrement
= 0x200;
428 if ( ! roundNearestEven
) {
429 if ( roundingMode
== float_round_to_zero
) {
433 roundIncrement
= 0x3FF;
435 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
438 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
442 roundBits
= zSig
& 0x3FF;
443 if ( 0x7FD <= (bits16
) zExp
) {
444 if ( ( 0x7FD < zExp
)
445 || ( ( zExp
== 0x7FD )
446 && ( (sbits64
) ( zSig
+ roundIncrement
) < 0 ) )
448 float_raise( float_flag_overflow
| float_flag_inexact
);
449 return packFloat64( zSign
, 0x7FF, 0 ) - ( roundIncrement
== 0 );
453 ( float_detect_tininess
== float_tininess_before_rounding
)
455 || ( zSig
+ roundIncrement
< LIT64( 0x8000000000000000 ) );
456 shift64RightJamming( zSig
, - zExp
, &zSig
);
458 roundBits
= zSig
& 0x3FF;
459 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
462 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
463 zSig
= ( zSig
+ roundIncrement
)>>10;
464 zSig
&= ~ ( ( ( roundBits
^ 0x200 ) == 0 ) & roundNearestEven
);
465 if ( zSig
== 0 ) zExp
= 0;
466 return packFloat64( zSign
, zExp
, zSig
);
470 /*----------------------------------------------------------------------------
471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
472 | and significand `zSig', and returns the proper double-precision floating-
473 | point value corresponding to the abstract input. This routine is just like
474 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
475 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
476 | floating-point exponent.
477 *----------------------------------------------------------------------------*/
480 normalizeRoundAndPackFloat64( flag zSign
, int16 zExp
, bits64 zSig
)
484 shiftCount
= countLeadingZeros64( zSig
) - 1;
485 return roundAndPackFloat64( zSign
, zExp
- shiftCount
, zSig
<<shiftCount
);
491 /*----------------------------------------------------------------------------
492 | Returns the fraction bits of the extended double-precision floating-point
494 *----------------------------------------------------------------------------*/
496 INLINE bits64
extractFloatx80Frac( floatx80 a
)
503 /*----------------------------------------------------------------------------
504 | Returns the exponent bits of the extended double-precision floating-point
506 *----------------------------------------------------------------------------*/
508 INLINE int32
extractFloatx80Exp( floatx80 a
)
511 return a
.high
& 0x7FFF;
515 /*----------------------------------------------------------------------------
516 | Returns the sign bit of the extended double-precision floating-point value
518 *----------------------------------------------------------------------------*/
520 INLINE flag
extractFloatx80Sign( floatx80 a
)
527 /*----------------------------------------------------------------------------
528 | Normalizes the subnormal extended double-precision floating-point value
529 | represented by the denormalized significand `aSig'. The normalized exponent
530 | and significand are stored at the locations pointed to by `zExpPtr' and
531 | `zSigPtr', respectively.
532 *----------------------------------------------------------------------------*/
535 normalizeFloatx80Subnormal( bits64 aSig
, int32
*zExpPtr
, bits64
*zSigPtr
)
539 shiftCount
= countLeadingZeros64( aSig
);
540 *zSigPtr
= aSig
<<shiftCount
;
541 *zExpPtr
= 1 - shiftCount
;
545 /*----------------------------------------------------------------------------
546 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
547 | extended double-precision floating-point value, returning the result.
548 *----------------------------------------------------------------------------*/
550 INLINE floatx80
packFloatx80( flag zSign
, int32 zExp
, bits64 zSig
)
555 z
.high
= ( ( (bits16
) zSign
)<<15 ) + zExp
;
560 /*----------------------------------------------------------------------------
561 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
562 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
563 | and returns the proper extended double-precision floating-point value
564 | corresponding to the abstract input. Ordinarily, the abstract value is
565 | rounded and packed into the extended double-precision format, with the
566 | inexact exception raised if the abstract input cannot be represented
567 | exactly. However, if the abstract value is too large, the overflow and
568 | inexact exceptions are raised and an infinity or maximal finite value is
569 | returned. If the abstract value is too small, the input value is rounded to
570 | a subnormal number, and the underflow and inexact exceptions are raised if
571 | the abstract input cannot be represented exactly as a subnormal extended
572 | double-precision floating-point number.
573 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
574 | number of bits as single or double precision, respectively. Otherwise, the
575 | result is rounded to the full precision of the extended double-precision
577 | The input significand must be normalized or smaller. If the input
578 | significand is not normalized, `zExp' must be 0; in that case, the result
579 | returned is a subnormal number, and it must not require rounding. The
580 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
581 | Floating-Point Arithmetic.
582 *----------------------------------------------------------------------------*/
585 roundAndPackFloatx80(
586 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
590 flag roundNearestEven
, increment
, isTiny
;
591 int64 roundIncrement
, roundMask
, roundBits
;
593 roundingMode
= float_rounding_mode
;
594 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
595 if ( roundingPrecision
== 80 ) goto precision80
;
596 if ( roundingPrecision
== 64 ) {
597 roundIncrement
= LIT64( 0x0000000000000400 );
598 roundMask
= LIT64( 0x00000000000007FF );
600 else if ( roundingPrecision
== 32 ) {
601 roundIncrement
= LIT64( 0x0000008000000000 );
602 roundMask
= LIT64( 0x000000FFFFFFFFFF );
607 zSig0
|= ( zSig1
!= 0 );
608 if ( ! roundNearestEven
) {
609 if ( roundingMode
== float_round_to_zero
) {
613 roundIncrement
= roundMask
;
615 if ( roundingMode
== float_round_up
) roundIncrement
= 0;
618 if ( roundingMode
== float_round_down
) roundIncrement
= 0;
622 roundBits
= zSig0
& roundMask
;
623 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
624 if ( ( 0x7FFE < zExp
)
625 || ( ( zExp
== 0x7FFE ) && ( zSig0
+ roundIncrement
< zSig0
) )
631 ( float_detect_tininess
== float_tininess_before_rounding
)
633 || ( zSig0
<= zSig0
+ roundIncrement
);
634 shift64RightJamming( zSig0
, 1 - zExp
, &zSig0
);
636 roundBits
= zSig0
& roundMask
;
637 if ( isTiny
&& roundBits
) float_raise( float_flag_underflow
);
638 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
639 zSig0
+= roundIncrement
;
640 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
641 roundIncrement
= roundMask
+ 1;
642 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
643 roundMask
|= roundIncrement
;
645 zSig0
&= ~ roundMask
;
646 return packFloatx80( zSign
, zExp
, zSig0
);
649 if ( roundBits
) float_exception_flags
|= float_flag_inexact
;
650 zSig0
+= roundIncrement
;
651 if ( zSig0
< roundIncrement
) {
653 zSig0
= LIT64( 0x8000000000000000 );
655 roundIncrement
= roundMask
+ 1;
656 if ( roundNearestEven
&& ( roundBits
<<1 == roundIncrement
) ) {
657 roundMask
|= roundIncrement
;
659 zSig0
&= ~ roundMask
;
660 if ( zSig0
== 0 ) zExp
= 0;
661 return packFloatx80( zSign
, zExp
, zSig0
);
663 increment
= ( (sbits64
) zSig1
< 0 );
664 if ( ! roundNearestEven
) {
665 if ( roundingMode
== float_round_to_zero
) {
670 increment
= ( roundingMode
== float_round_down
) && zSig1
;
673 increment
= ( roundingMode
== float_round_up
) && zSig1
;
677 if ( 0x7FFD <= (bits32
) ( zExp
- 1 ) ) {
678 if ( ( 0x7FFE < zExp
)
679 || ( ( zExp
== 0x7FFE )
680 && ( zSig0
== LIT64( 0xFFFFFFFFFFFFFFFF ) )
686 float_raise( float_flag_overflow
| float_flag_inexact
);
687 if ( ( roundingMode
== float_round_to_zero
)
688 || ( zSign
&& ( roundingMode
== float_round_up
) )
689 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
691 return packFloatx80( zSign
, 0x7FFE, ~ roundMask
);
693 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
697 ( float_detect_tininess
== float_tininess_before_rounding
)
700 || ( zSig0
< LIT64( 0xFFFFFFFFFFFFFFFF ) );
701 shift64ExtraRightJamming( zSig0
, zSig1
, 1 - zExp
, &zSig0
, &zSig1
);
703 if ( isTiny
&& zSig1
) float_raise( float_flag_underflow
);
704 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
705 if ( roundNearestEven
) {
706 increment
= ( (sbits64
) zSig1
< 0 );
710 increment
= ( roundingMode
== float_round_down
) && zSig1
;
713 increment
= ( roundingMode
== float_round_up
) && zSig1
;
719 ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
720 if ( (sbits64
) zSig0
< 0 ) zExp
= 1;
722 return packFloatx80( zSign
, zExp
, zSig0
);
725 if ( zSig1
) float_exception_flags
|= float_flag_inexact
;
730 zSig0
= LIT64( 0x8000000000000000 );
733 zSig0
&= ~ ( ( (bits64
) ( zSig1
<<1 ) == 0 ) & roundNearestEven
);
737 if ( zSig0
== 0 ) zExp
= 0;
739 return packFloatx80( zSign
, zExp
, zSig0
);
743 /*----------------------------------------------------------------------------
744 | Takes an abstract floating-point value having sign `zSign', exponent
745 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
746 | and returns the proper extended double-precision floating-point value
747 | corresponding to the abstract input. This routine is just like
748 | `roundAndPackFloatx80' except that the input significand does not have to be
750 *----------------------------------------------------------------------------*/
753 normalizeRoundAndPackFloatx80(
754 int8 roundingPrecision
, flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
764 shiftCount
= countLeadingZeros64( zSig0
);
765 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
768 roundAndPackFloatx80( roundingPrecision
, zSign
, zExp
, zSig0
, zSig1
);
776 /*----------------------------------------------------------------------------
777 | Returns the least-significant 64 fraction bits of the quadruple-precision
778 | floating-point value `a'.
779 *----------------------------------------------------------------------------*/
781 INLINE bits64
extractFloat128Frac1( float128 a
)
788 /*----------------------------------------------------------------------------
789 | Returns the most-significant 48 fraction bits of the quadruple-precision
790 | floating-point value `a'.
791 *----------------------------------------------------------------------------*/
793 INLINE bits64
extractFloat128Frac0( float128 a
)
796 return a
.high
& LIT64( 0x0000FFFFFFFFFFFF );
800 /*----------------------------------------------------------------------------
801 | Returns the exponent bits of the quadruple-precision floating-point value
803 *----------------------------------------------------------------------------*/
805 INLINE int32
extractFloat128Exp( float128 a
)
808 return ( a
.high
>>48 ) & 0x7FFF;
812 /*----------------------------------------------------------------------------
813 | Returns the sign bit of the quadruple-precision floating-point value `a'.
814 *----------------------------------------------------------------------------*/
816 INLINE flag
extractFloat128Sign( float128 a
)
823 /*----------------------------------------------------------------------------
824 | Normalizes the subnormal quadruple-precision floating-point value
825 | represented by the denormalized significand formed by the concatenation of
826 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
827 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
828 | significand are stored at the location pointed to by `zSig0Ptr', and the
829 | least significant 64 bits of the normalized significand are stored at the
830 | location pointed to by `zSig1Ptr'.
831 *----------------------------------------------------------------------------*/
834 normalizeFloat128Subnormal(
845 shiftCount
= countLeadingZeros64( aSig1
) - 15;
846 if ( shiftCount
< 0 ) {
847 *zSig0Ptr
= aSig1
>>( - shiftCount
);
848 *zSig1Ptr
= aSig1
<<( shiftCount
& 63 );
851 *zSig0Ptr
= aSig1
<<shiftCount
;
854 *zExpPtr
= - shiftCount
- 63;
857 shiftCount
= countLeadingZeros64( aSig0
) - 15;
858 shortShift128Left( aSig0
, aSig1
, shiftCount
, zSig0Ptr
, zSig1Ptr
);
859 *zExpPtr
= 1 - shiftCount
;
864 /*----------------------------------------------------------------------------
865 | Packs the sign `zSign', the exponent `zExp', and the significand formed
866 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
867 | floating-point value, returning the result. After being shifted into the
868 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
869 | added together to form the most significant 32 bits of the result. This
870 | means that any integer portion of `zSig0' will be added into the exponent.
871 | Since a properly normalized significand will have an integer portion equal
872 | to 1, the `zExp' input should be 1 less than the desired result exponent
873 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
875 *----------------------------------------------------------------------------*/
878 packFloat128( flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
883 z
.high
= ( ( (bits64
) zSign
)<<63 ) + ( ( (bits64
) zExp
)<<48 ) + zSig0
;
888 /*----------------------------------------------------------------------------
889 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
890 | and extended significand formed by the concatenation of `zSig0', `zSig1',
891 | and `zSig2', and returns the proper quadruple-precision floating-point value
892 | corresponding to the abstract input. Ordinarily, the abstract value is
893 | simply rounded and packed into the quadruple-precision format, with the
894 | inexact exception raised if the abstract input cannot be represented
895 | exactly. However, if the abstract value is too large, the overflow and
896 | inexact exceptions are raised and an infinity or maximal finite value is
897 | returned. If the abstract value is too small, the input value is rounded to
898 | a subnormal number, and the underflow and inexact exceptions are raised if
899 | the abstract input cannot be represented exactly as a subnormal quadruple-
900 | precision floating-point number.
901 | The input significand must be normalized or smaller. If the input
902 | significand is not normalized, `zExp' must be 0; in that case, the result
903 | returned is a subnormal number, and it must not require rounding. In the
904 | usual case that the input significand is normalized, `zExp' must be 1 less
905 | than the ``true'' floating-point exponent. The handling of underflow and
906 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
907 *----------------------------------------------------------------------------*/
910 roundAndPackFloat128(
911 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
, bits64 zSig2
)
914 flag roundNearestEven
, increment
, isTiny
;
916 roundingMode
= float_rounding_mode
;
917 roundNearestEven
= ( roundingMode
== float_round_nearest_even
);
918 increment
= ( (sbits64
) zSig2
< 0 );
919 if ( ! roundNearestEven
) {
920 if ( roundingMode
== float_round_to_zero
) {
925 increment
= ( roundingMode
== float_round_down
) && zSig2
;
928 increment
= ( roundingMode
== float_round_up
) && zSig2
;
932 if ( 0x7FFD <= (bits32
) zExp
) {
933 if ( ( 0x7FFD < zExp
)
934 || ( ( zExp
== 0x7FFD )
936 LIT64( 0x0001FFFFFFFFFFFF ),
937 LIT64( 0xFFFFFFFFFFFFFFFF ),
944 float_raise( float_flag_overflow
| float_flag_inexact
);
945 if ( ( roundingMode
== float_round_to_zero
)
946 || ( zSign
&& ( roundingMode
== float_round_up
) )
947 || ( ! zSign
&& ( roundingMode
== float_round_down
) )
953 LIT64( 0x0000FFFFFFFFFFFF ),
954 LIT64( 0xFFFFFFFFFFFFFFFF )
957 return packFloat128( zSign
, 0x7FFF, 0, 0 );
961 ( float_detect_tininess
== float_tininess_before_rounding
)
967 LIT64( 0x0001FFFFFFFFFFFF ),
968 LIT64( 0xFFFFFFFFFFFFFFFF )
970 shift128ExtraRightJamming(
971 zSig0
, zSig1
, zSig2
, - zExp
, &zSig0
, &zSig1
, &zSig2
);
973 if ( isTiny
&& zSig2
) float_raise( float_flag_underflow
);
974 if ( roundNearestEven
) {
975 increment
= ( (sbits64
) zSig2
< 0 );
979 increment
= ( roundingMode
== float_round_down
) && zSig2
;
982 increment
= ( roundingMode
== float_round_up
) && zSig2
;
987 if ( zSig2
) float_exception_flags
|= float_flag_inexact
;
989 add128( zSig0
, zSig1
, 0, 1, &zSig0
, &zSig1
);
990 zSig1
&= ~ ( ( zSig2
+ zSig2
== 0 ) & roundNearestEven
);
993 if ( ( zSig0
| zSig1
) == 0 ) zExp
= 0;
995 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
999 /*----------------------------------------------------------------------------
1000 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1001 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1002 | returns the proper quadruple-precision floating-point value corresponding
1003 | to the abstract input. This routine is just like `roundAndPackFloat128'
1004 | except that the input significand has fewer bits and does not have to be
1005 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1007 *----------------------------------------------------------------------------*/
1010 normalizeRoundAndPackFloat128(
1011 flag zSign
, int32 zExp
, bits64 zSig0
, bits64 zSig1
)
1021 shiftCount
= countLeadingZeros64( zSig0
) - 15;
1022 if ( 0 <= shiftCount
) {
1024 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1027 shift128ExtraRightJamming(
1028 zSig0
, zSig1
, 0, - shiftCount
, &zSig0
, &zSig1
, &zSig2
);
1031 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
1037 /*----------------------------------------------------------------------------
1038 | Returns the result of converting the 32-bit two's complement integer `a'
1039 | to the single-precision floating-point format. The conversion is performed
1040 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1043 float32
int32_to_float32( int32 a
)
1047 if ( a
== 0 ) return 0;
1048 if ( a
== (sbits32
) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1050 return normalizeRoundAndPackFloat32( zSign
, 0x9C, zSign
? - a
: a
);
1054 /*----------------------------------------------------------------------------
1055 | Returns the result of converting the 32-bit two's complement integer `a'
1056 | to the double-precision floating-point format. The conversion is performed
1057 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1058 *----------------------------------------------------------------------------*/
1060 float64
int32_to_float64( int32 a
)
1067 if ( a
== 0 ) return 0;
1069 absA
= zSign
? - a
: a
;
1070 shiftCount
= countLeadingZeros32( absA
) + 21;
1072 return packFloat64( zSign
, 0x432 - shiftCount
, zSig
<<shiftCount
);
1078 /*----------------------------------------------------------------------------
1079 | Returns the result of converting the 32-bit two's complement integer `a'
1080 | to the extended double-precision floating-point format. The conversion
1081 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1083 *----------------------------------------------------------------------------*/
1085 floatx80
int32_to_floatx80( int32 a
)
1092 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1094 absA
= zSign
? - a
: a
;
1095 shiftCount
= countLeadingZeros32( absA
) + 32;
1097 return packFloatx80( zSign
, 0x403E - shiftCount
, zSig
<<shiftCount
);
1105 /*----------------------------------------------------------------------------
1106 | Returns the result of converting the 32-bit two's complement integer `a' to
1107 | the quadruple-precision floating-point format. The conversion is performed
1108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1109 *----------------------------------------------------------------------------*/
1111 float128
int32_to_float128( int32 a
)
1118 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1120 absA
= zSign
? - a
: a
;
1121 shiftCount
= countLeadingZeros32( absA
) + 17;
1123 return packFloat128( zSign
, 0x402E - shiftCount
, zSig0
<<shiftCount
, 0 );
1129 /*----------------------------------------------------------------------------
1130 | Returns the result of converting the 64-bit two's complement integer `a'
1131 | to the single-precision floating-point format. The conversion is performed
1132 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1133 *----------------------------------------------------------------------------*/
1135 float32
int64_to_float32( int64 a
)
1142 if ( a
== 0 ) return 0;
1144 absA
= zSign
? - a
: a
;
1145 shiftCount
= countLeadingZeros64( absA
) - 40;
1146 if ( 0 <= shiftCount
) {
1147 return packFloat32( zSign
, 0x95 - shiftCount
, absA
<<shiftCount
);
1151 if ( shiftCount
< 0 ) {
1152 shift64RightJamming( absA
, - shiftCount
, &absA
);
1155 absA
<<= shiftCount
;
1157 return roundAndPackFloat32( zSign
, 0x9C - shiftCount
, absA
);
1162 /*----------------------------------------------------------------------------
1163 | Returns the result of converting the 64-bit two's complement integer `a'
1164 | to the double-precision floating-point format. The conversion is performed
1165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1166 *----------------------------------------------------------------------------*/
1168 float64
int64_to_float64( int64 a
)
1172 if ( a
== 0 ) return 0;
1173 if ( a
== (sbits64
) LIT64( 0x8000000000000000 ) ) {
1174 return packFloat64( 1, 0x43E, 0 );
1177 return normalizeRoundAndPackFloat64( zSign
, 0x43C, zSign
? - a
: a
);
1183 /*----------------------------------------------------------------------------
1184 | Returns the result of converting the 64-bit two's complement integer `a'
1185 | to the extended double-precision floating-point format. The conversion
1186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1188 *----------------------------------------------------------------------------*/
1190 floatx80
int64_to_floatx80( int64 a
)
1196 if ( a
== 0 ) return packFloatx80( 0, 0, 0 );
1198 absA
= zSign
? - a
: a
;
1199 shiftCount
= countLeadingZeros64( absA
);
1200 return packFloatx80( zSign
, 0x403E - shiftCount
, absA
<<shiftCount
);
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 64-bit two's complement integer `a' to
1210 | the quadruple-precision floating-point format. The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1214 float128
int64_to_float128( int64 a
)
1220 bits64 zSig0
, zSig1
;
1222 if ( a
== 0 ) return packFloat128( 0, 0, 0, 0 );
1224 absA
= zSign
? - a
: a
;
1225 shiftCount
= countLeadingZeros64( absA
) + 49;
1226 zExp
= 0x406E - shiftCount
;
1227 if ( 64 <= shiftCount
) {
1236 shortShift128Left( zSig0
, zSig1
, shiftCount
, &zSig0
, &zSig1
);
1237 return packFloat128( zSign
, zExp
, zSig0
, zSig1
);
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the single-precision floating-point value
1245 | `a' to the 32-bit two's complement integer format. The conversion is
1246 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1247 | Arithmetic---which means in particular that the conversion is rounded
1248 | according to the current rounding mode. If `a' is a NaN, the largest
1249 | positive integer is returned. Otherwise, if the conversion overflows, the
1250 | largest integer with the same sign as `a' is returned.
1251 *----------------------------------------------------------------------------*/
1253 int32
float32_to_int32( float32 a
)
1256 int16 aExp
, shiftCount
;
1260 aSig
= extractFloat32Frac( a
);
1261 aExp
= extractFloat32Exp( a
);
1262 aSign
= extractFloat32Sign( a
);
1263 if ( ( aExp
== 0xFF ) && aSig
) aSign
= 0;
1264 if ( aExp
) aSig
|= 0x00800000;
1265 shiftCount
= 0xAF - aExp
;
1268 if ( 0 < shiftCount
) shift64RightJamming( aSig64
, shiftCount
, &aSig64
);
1269 return roundAndPackInt32( aSign
, aSig64
);
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the single-precision floating-point value
1275 | `a' to the 32-bit two's complement integer format. The conversion is
1276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 | Arithmetic, except that the conversion is always rounded toward zero.
1278 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1279 | the conversion overflows, the largest integer with the same sign as `a' is
1281 *----------------------------------------------------------------------------*/
1283 int32
float32_to_int32_round_to_zero( float32 a
)
1286 int16 aExp
, shiftCount
;
1290 aSig
= extractFloat32Frac( a
);
1291 aExp
= extractFloat32Exp( a
);
1292 aSign
= extractFloat32Sign( a
);
1293 shiftCount
= aExp
- 0x9E;
1294 if ( 0 <= shiftCount
) {
1295 if ( a
!= 0xCF000000 ) {
1296 float_raise( float_flag_invalid
);
1297 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) return 0x7FFFFFFF;
1299 return (sbits32
) 0x80000000;
1301 else if ( aExp
<= 0x7E ) {
1302 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
1305 aSig
= ( aSig
| 0x00800000 )<<8;
1306 z
= aSig
>>( - shiftCount
);
1307 if ( (bits32
) ( aSig
<<( shiftCount
& 31 ) ) ) {
1308 float_exception_flags
|= float_flag_inexact
;
1310 if ( aSign
) z
= - z
;
1315 /*----------------------------------------------------------------------------
1316 | Returns the result of converting the single-precision floating-point value
1317 | `a' to the 64-bit two's complement integer format. The conversion is
1318 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1319 | Arithmetic---which means in particular that the conversion is rounded
1320 | according to the current rounding mode. If `a' is a NaN, the largest
1321 | positive integer is returned. Otherwise, if the conversion overflows, the
1322 | largest integer with the same sign as `a' is returned.
1323 *----------------------------------------------------------------------------*/
1325 int64
float32_to_int64( float32 a
)
1328 int16 aExp
, shiftCount
;
1330 bits64 aSig64
, aSigExtra
;
1332 aSig
= extractFloat32Frac( a
);
1333 aExp
= extractFloat32Exp( a
);
1334 aSign
= extractFloat32Sign( a
);
1335 shiftCount
= 0xBE - aExp
;
1336 if ( shiftCount
< 0 ) {
1337 float_raise( float_flag_invalid
);
1338 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1339 return LIT64( 0x7FFFFFFFFFFFFFFF );
1341 return (sbits64
) LIT64( 0x8000000000000000 );
1343 if ( aExp
) aSig
|= 0x00800000;
1346 shift64ExtraRightJamming( aSig64
, 0, shiftCount
, &aSig64
, &aSigExtra
);
1347 return roundAndPackInt64( aSign
, aSig64
, aSigExtra
);
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the single-precision floating-point value
1353 | `a' to the 64-bit two's complement integer format. The conversion is
1354 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1355 | Arithmetic, except that the conversion is always rounded toward zero. If
1356 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1357 | conversion overflows, the largest integer with the same sign as `a' is
1359 *----------------------------------------------------------------------------*/
1361 int64
float32_to_int64_round_to_zero( float32 a
)
1364 int16 aExp
, shiftCount
;
1369 aSig
= extractFloat32Frac( a
);
1370 aExp
= extractFloat32Exp( a
);
1371 aSign
= extractFloat32Sign( a
);
1372 shiftCount
= aExp
- 0xBE;
1373 if ( 0 <= shiftCount
) {
1374 if ( a
!= 0xDF000000 ) {
1375 float_raise( float_flag_invalid
);
1376 if ( ! aSign
|| ( ( aExp
== 0xFF ) && aSig
) ) {
1377 return LIT64( 0x7FFFFFFFFFFFFFFF );
1380 return (sbits64
) LIT64( 0x8000000000000000 );
1382 else if ( aExp
<= 0x7E ) {
1383 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
1386 aSig64
= aSig
| 0x00800000;
1388 z
= aSig64
>>( - shiftCount
);
1389 if ( (bits64
) ( aSig64
<<( shiftCount
& 63 ) ) ) {
1390 float_exception_flags
|= float_flag_inexact
;
1392 if ( aSign
) z
= - z
;
1397 /*----------------------------------------------------------------------------
1398 | Returns the result of converting the single-precision floating-point value
1399 | `a' to the double-precision floating-point format. The conversion is
1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1402 *----------------------------------------------------------------------------*/
1404 float64
float32_to_float64( float32 a
)
1410 aSig
= extractFloat32Frac( a
);
1411 aExp
= extractFloat32Exp( a
);
1412 aSign
= extractFloat32Sign( a
);
1413 if ( aExp
== 0xFF ) {
1414 if ( aSig
) return commonNaNToFloat64( float32ToCommonNaN( a
) );
1415 return packFloat64( aSign
, 0x7FF, 0 );
1418 if ( aSig
== 0 ) return packFloat64( aSign
, 0, 0 );
1419 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1422 return packFloat64( aSign
, aExp
+ 0x380, ( (bits64
) aSig
)<<29 );
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the single-precision floating-point value
1430 | `a' to the extended double-precision floating-point format. The conversion
1431 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1433 *----------------------------------------------------------------------------*/
1435 floatx80
float32_to_floatx80( float32 a
)
1441 aSig
= extractFloat32Frac( a
);
1442 aExp
= extractFloat32Exp( a
);
1443 aSign
= extractFloat32Sign( a
);
1444 if ( aExp
== 0xFF ) {
1445 if ( aSig
) return commonNaNToFloatx80( float32ToCommonNaN( a
) );
1446 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
1449 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
1450 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1453 return packFloatx80( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<40 );
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the double-precision floating-point format. The conversion is
1464 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1466 *----------------------------------------------------------------------------*/
1468 float128
float32_to_float128( float32 a
)
1474 aSig
= extractFloat32Frac( a
);
1475 aExp
= extractFloat32Exp( a
);
1476 aSign
= extractFloat32Sign( a
);
1477 if ( aExp
== 0xFF ) {
1478 if ( aSig
) return commonNaNToFloat128( float32ToCommonNaN( a
) );
1479 return packFloat128( aSign
, 0x7FFF, 0, 0 );
1482 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
1483 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1486 return packFloat128( aSign
, aExp
+ 0x3F80, ( (bits64
) aSig
)<<25, 0 );
1492 /*----------------------------------------------------------------------------
1493 | Rounds the single-precision floating-point value `a' to an integer, and
1494 | returns the result as a single-precision floating-point value. The
1495 | operation is performed according to the IEC/IEEE Standard for Binary
1496 | Floating-Point Arithmetic.
1497 *----------------------------------------------------------------------------*/
1499 float32
float32_round_to_int( float32 a
)
1503 bits32 lastBitMask
, roundBitsMask
;
1507 aExp
= extractFloat32Exp( a
);
1508 if ( 0x96 <= aExp
) {
1509 if ( ( aExp
== 0xFF ) && extractFloat32Frac( a
) ) {
1510 return propagateFloat32NaN( a
, a
);
1514 if ( aExp
<= 0x7E ) {
1515 if ( (bits32
) ( a
<<1 ) == 0 ) return a
;
1516 float_exception_flags
|= float_flag_inexact
;
1517 aSign
= extractFloat32Sign( a
);
1518 switch ( float_rounding_mode
) {
1519 case float_round_nearest_even
:
1520 if ( ( aExp
== 0x7E ) && extractFloat32Frac( a
) ) {
1521 return packFloat32( aSign
, 0x7F, 0 );
1524 case float_round_down
:
1525 return aSign
? 0xBF800000 : 0;
1526 case float_round_up
:
1527 return aSign
? 0x80000000 : 0x3F800000;
1529 return packFloat32( aSign
, 0, 0 );
1532 lastBitMask
<<= 0x96 - aExp
;
1533 roundBitsMask
= lastBitMask
- 1;
1535 roundingMode
= float_rounding_mode
;
1536 if ( roundingMode
== float_round_nearest_even
) {
1537 z
+= lastBitMask
>>1;
1538 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
1540 else if ( roundingMode
!= float_round_to_zero
) {
1541 if ( extractFloat32Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
1545 z
&= ~ roundBitsMask
;
1546 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of adding the absolute values of the single-precision
1553 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1554 | before being returned. `zSign' is ignored if the result is a NaN.
1555 | The addition is performed according to the IEC/IEEE Standard for Binary
1556 | Floating-Point Arithmetic.
1557 *----------------------------------------------------------------------------*/
1559 static float32
addFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1561 int16 aExp
, bExp
, zExp
;
1562 bits32 aSig
, bSig
, zSig
;
1565 aSig
= extractFloat32Frac( a
);
1566 aExp
= extractFloat32Exp( a
);
1567 bSig
= extractFloat32Frac( b
);
1568 bExp
= extractFloat32Exp( b
);
1569 expDiff
= aExp
- bExp
;
1572 if ( 0 < expDiff
) {
1573 if ( aExp
== 0xFF ) {
1574 if ( aSig
) return propagateFloat32NaN( a
, b
);
1583 shift32RightJamming( bSig
, expDiff
, &bSig
);
1586 else if ( expDiff
< 0 ) {
1587 if ( bExp
== 0xFF ) {
1588 if ( bSig
) return propagateFloat32NaN( a
, b
);
1589 return packFloat32( zSign
, 0xFF, 0 );
1597 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1601 if ( aExp
== 0xFF ) {
1602 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1605 if ( aExp
== 0 ) return packFloat32( zSign
, 0, ( aSig
+ bSig
)>>6 );
1606 zSig
= 0x40000000 + aSig
+ bSig
;
1611 zSig
= ( aSig
+ bSig
)<<1;
1613 if ( (sbits32
) zSig
< 0 ) {
1618 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1622 /*----------------------------------------------------------------------------
1623 | Returns the result of subtracting the absolute values of the single-
1624 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1625 | difference is negated before being returned. `zSign' is ignored if the
1626 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1627 | Standard for Binary Floating-Point Arithmetic.
1628 *----------------------------------------------------------------------------*/
1630 static float32
subFloat32Sigs( float32 a
, float32 b
, flag zSign
)
1632 int16 aExp
, bExp
, zExp
;
1633 bits32 aSig
, bSig
, zSig
;
1636 aSig
= extractFloat32Frac( a
);
1637 aExp
= extractFloat32Exp( a
);
1638 bSig
= extractFloat32Frac( b
);
1639 bExp
= extractFloat32Exp( b
);
1640 expDiff
= aExp
- bExp
;
1643 if ( 0 < expDiff
) goto aExpBigger
;
1644 if ( expDiff
< 0 ) goto bExpBigger
;
1645 if ( aExp
== 0xFF ) {
1646 if ( aSig
| bSig
) return propagateFloat32NaN( a
, b
);
1647 float_raise( float_flag_invalid
);
1648 return float32_default_nan
;
1654 if ( bSig
< aSig
) goto aBigger
;
1655 if ( aSig
< bSig
) goto bBigger
;
1656 return packFloat32( float_rounding_mode
== float_round_down
, 0, 0 );
1658 if ( bExp
== 0xFF ) {
1659 if ( bSig
) return propagateFloat32NaN( a
, b
);
1660 return packFloat32( zSign
^ 1, 0xFF, 0 );
1668 shift32RightJamming( aSig
, - expDiff
, &aSig
);
1674 goto normalizeRoundAndPack
;
1676 if ( aExp
== 0xFF ) {
1677 if ( aSig
) return propagateFloat32NaN( a
, b
);
1686 shift32RightJamming( bSig
, expDiff
, &bSig
);
1691 normalizeRoundAndPack
:
1693 return normalizeRoundAndPackFloat32( zSign
, zExp
, zSig
);
1697 /*----------------------------------------------------------------------------
1698 | Returns the result of adding the single-precision floating-point values `a'
1699 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1700 | Binary Floating-Point Arithmetic.
1701 *----------------------------------------------------------------------------*/
1703 float32
float32_add( float32 a
, float32 b
)
1707 aSign
= extractFloat32Sign( a
);
1708 bSign
= extractFloat32Sign( b
);
1709 if ( aSign
== bSign
) {
1710 return addFloat32Sigs( a
, b
, aSign
);
1713 return subFloat32Sigs( a
, b
, aSign
);
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of subtracting the single-precision floating-point values
1720 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1721 | for Binary Floating-Point Arithmetic.
1722 *----------------------------------------------------------------------------*/
1724 float32
float32_sub( float32 a
, float32 b
)
1728 aSign
= extractFloat32Sign( a
);
1729 bSign
= extractFloat32Sign( b
);
1730 if ( aSign
== bSign
) {
1731 return subFloat32Sigs( a
, b
, aSign
);
1734 return addFloat32Sigs( a
, b
, aSign
);
1739 /*----------------------------------------------------------------------------
1740 | Returns the result of multiplying the single-precision floating-point values
1741 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1742 | for Binary Floating-Point Arithmetic.
1743 *----------------------------------------------------------------------------*/
1745 float32
float32_mul( float32 a
, float32 b
)
1747 flag aSign
, bSign
, zSign
;
1748 int16 aExp
, bExp
, zExp
;
1753 aSig
= extractFloat32Frac( a
);
1754 aExp
= extractFloat32Exp( a
);
1755 aSign
= extractFloat32Sign( a
);
1756 bSig
= extractFloat32Frac( b
);
1757 bExp
= extractFloat32Exp( b
);
1758 bSign
= extractFloat32Sign( b
);
1759 zSign
= aSign
^ bSign
;
1760 if ( aExp
== 0xFF ) {
1761 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1762 return propagateFloat32NaN( a
, b
);
1764 if ( ( bExp
| bSig
) == 0 ) {
1765 float_raise( float_flag_invalid
);
1766 return float32_default_nan
;
1768 return packFloat32( zSign
, 0xFF, 0 );
1770 if ( bExp
== 0xFF ) {
1771 if ( bSig
) return propagateFloat32NaN( a
, b
);
1772 if ( ( aExp
| aSig
) == 0 ) {
1773 float_raise( float_flag_invalid
);
1774 return float32_default_nan
;
1776 return packFloat32( zSign
, 0xFF, 0 );
1779 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1780 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1783 if ( bSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1784 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1786 zExp
= aExp
+ bExp
- 0x7F;
1787 aSig
= ( aSig
| 0x00800000 )<<7;
1788 bSig
= ( bSig
| 0x00800000 )<<8;
1789 shift64RightJamming( ( (bits64
) aSig
) * bSig
, 32, &zSig64
);
1791 if ( 0 <= (sbits32
) ( zSig
<<1 ) ) {
1795 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1799 /*----------------------------------------------------------------------------
1800 | Returns the result of dividing the single-precision floating-point value `a'
1801 | by the corresponding value `b'. The operation is performed according to the
1802 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1803 *----------------------------------------------------------------------------*/
1805 float32
float32_div( float32 a
, float32 b
)
1807 flag aSign
, bSign
, zSign
;
1808 int16 aExp
, bExp
, zExp
;
1809 bits32 aSig
, bSig
, zSig
;
1811 aSig
= extractFloat32Frac( a
);
1812 aExp
= extractFloat32Exp( a
);
1813 aSign
= extractFloat32Sign( a
);
1814 bSig
= extractFloat32Frac( b
);
1815 bExp
= extractFloat32Exp( b
);
1816 bSign
= extractFloat32Sign( b
);
1817 zSign
= aSign
^ bSign
;
1818 if ( aExp
== 0xFF ) {
1819 if ( aSig
) return propagateFloat32NaN( a
, b
);
1820 if ( bExp
== 0xFF ) {
1821 if ( bSig
) return propagateFloat32NaN( a
, b
);
1822 float_raise( float_flag_invalid
);
1823 return float32_default_nan
;
1825 return packFloat32( zSign
, 0xFF, 0 );
1827 if ( bExp
== 0xFF ) {
1828 if ( bSig
) return propagateFloat32NaN( a
, b
);
1829 return packFloat32( zSign
, 0, 0 );
1833 if ( ( aExp
| aSig
) == 0 ) {
1834 float_raise( float_flag_invalid
);
1835 return float32_default_nan
;
1837 float_raise( float_flag_divbyzero
);
1838 return packFloat32( zSign
, 0xFF, 0 );
1840 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1843 if ( aSig
== 0 ) return packFloat32( zSign
, 0, 0 );
1844 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1846 zExp
= aExp
- bExp
+ 0x7D;
1847 aSig
= ( aSig
| 0x00800000 )<<7;
1848 bSig
= ( bSig
| 0x00800000 )<<8;
1849 if ( bSig
<= ( aSig
+ aSig
) ) {
1853 zSig
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1854 if ( ( zSig
& 0x3F ) == 0 ) {
1855 zSig
|= ( (bits64
) bSig
* zSig
!= ( (bits64
) aSig
)<<32 );
1857 return roundAndPackFloat32( zSign
, zExp
, zSig
);
1861 /*----------------------------------------------------------------------------
1862 | Returns the remainder of the single-precision floating-point value `a'
1863 | with respect to the corresponding value `b'. The operation is performed
1864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1867 float32
float32_rem( float32 a
, float32 b
)
1869 flag aSign
, bSign
, zSign
;
1870 int16 aExp
, bExp
, expDiff
;
1873 bits64 aSig64
, bSig64
, q64
;
1874 bits32 alternateASig
;
1877 aSig
= extractFloat32Frac( a
);
1878 aExp
= extractFloat32Exp( a
);
1879 aSign
= extractFloat32Sign( a
);
1880 bSig
= extractFloat32Frac( b
);
1881 bExp
= extractFloat32Exp( b
);
1882 bSign
= extractFloat32Sign( b
);
1883 if ( aExp
== 0xFF ) {
1884 if ( aSig
|| ( ( bExp
== 0xFF ) && bSig
) ) {
1885 return propagateFloat32NaN( a
, b
);
1887 float_raise( float_flag_invalid
);
1888 return float32_default_nan
;
1890 if ( bExp
== 0xFF ) {
1891 if ( bSig
) return propagateFloat32NaN( a
, b
);
1896 float_raise( float_flag_invalid
);
1897 return float32_default_nan
;
1899 normalizeFloat32Subnormal( bSig
, &bExp
, &bSig
);
1902 if ( aSig
== 0 ) return a
;
1903 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1905 expDiff
= aExp
- bExp
;
1908 if ( expDiff
< 32 ) {
1911 if ( expDiff
< 0 ) {
1912 if ( expDiff
< -1 ) return a
;
1915 q
= ( bSig
<= aSig
);
1916 if ( q
) aSig
-= bSig
;
1917 if ( 0 < expDiff
) {
1918 q
= ( ( (bits64
) aSig
)<<32 ) / bSig
;
1921 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
1929 if ( bSig
<= aSig
) aSig
-= bSig
;
1930 aSig64
= ( (bits64
) aSig
)<<40;
1931 bSig64
= ( (bits64
) bSig
)<<40;
1933 while ( 0 < expDiff
) {
1934 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1935 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1936 aSig64
= - ( ( bSig
* q64
)<<38 );
1940 q64
= estimateDiv128To64( aSig64
, 0, bSig64
);
1941 q64
= ( 2 < q64
) ? q64
- 2 : 0;
1942 q
= q64
>>( 64 - expDiff
);
1944 aSig
= ( ( aSig64
>>33 )<<( expDiff
- 1 ) ) - bSig
* q
;
1947 alternateASig
= aSig
;
1950 } while ( 0 <= (sbits32
) aSig
);
1951 sigMean
= aSig
+ alternateASig
;
1952 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
1953 aSig
= alternateASig
;
1955 zSign
= ( (sbits32
) aSig
< 0 );
1956 if ( zSign
) aSig
= - aSig
;
1957 return normalizeRoundAndPackFloat32( aSign
^ zSign
, bExp
, aSig
);
1961 /*----------------------------------------------------------------------------
1962 | Returns the square root of the single-precision floating-point value `a'.
1963 | The operation is performed according to the IEC/IEEE Standard for Binary
1964 | Floating-Point Arithmetic.
1965 *----------------------------------------------------------------------------*/
1967 float32
float32_sqrt( float32 a
)
1974 aSig
= extractFloat32Frac( a
);
1975 aExp
= extractFloat32Exp( a
);
1976 aSign
= extractFloat32Sign( a
);
1977 if ( aExp
== 0xFF ) {
1978 if ( aSig
) return propagateFloat32NaN( a
, 0 );
1979 if ( ! aSign
) return a
;
1980 float_raise( float_flag_invalid
);
1981 return float32_default_nan
;
1984 if ( ( aExp
| aSig
) == 0 ) return a
;
1985 float_raise( float_flag_invalid
);
1986 return float32_default_nan
;
1989 if ( aSig
== 0 ) return 0;
1990 normalizeFloat32Subnormal( aSig
, &aExp
, &aSig
);
1992 zExp
= ( ( aExp
- 0x7F )>>1 ) + 0x7E;
1993 aSig
= ( aSig
| 0x00800000 )<<8;
1994 zSig
= estimateSqrt32( aExp
, aSig
) + 2;
1995 if ( ( zSig
& 0x7F ) <= 5 ) {
2001 term
= ( (bits64
) zSig
) * zSig
;
2002 rem
= ( ( (bits64
) aSig
)<<32 ) - term
;
2003 while ( (sbits64
) rem
< 0 ) {
2005 rem
+= ( ( (bits64
) zSig
)<<1 ) | 1;
2007 zSig
|= ( rem
!= 0 );
2009 shift32RightJamming( zSig
, 1, &zSig
);
2011 return roundAndPackFloat32( 0, zExp
, zSig
);
2015 /*----------------------------------------------------------------------------
2016 | Returns 1 if the single-precision floating-point value `a' is equal to
2017 | the corresponding value `b', and 0 otherwise. The comparison is performed
2018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019 *----------------------------------------------------------------------------*/
2021 flag
float32_eq( float32 a
, float32 b
)
2024 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2025 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2027 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2028 float_raise( float_flag_invalid
);
2032 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2036 /*----------------------------------------------------------------------------
2037 | Returns 1 if the single-precision floating-point value `a' is less than
2038 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2041 *----------------------------------------------------------------------------*/
2043 flag
float32_le( float32 a
, float32 b
)
2047 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2048 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2050 float_raise( float_flag_invalid
);
2053 aSign
= extractFloat32Sign( a
);
2054 bSign
= extractFloat32Sign( b
);
2055 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2056 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2060 /*----------------------------------------------------------------------------
2061 | Returns 1 if the single-precision floating-point value `a' is less than
2062 | the corresponding value `b', and 0 otherwise. The comparison is performed
2063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2064 *----------------------------------------------------------------------------*/
2066 flag
float32_lt( float32 a
, float32 b
)
2070 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2071 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2073 float_raise( float_flag_invalid
);
2076 aSign
= extractFloat32Sign( a
);
2077 bSign
= extractFloat32Sign( b
);
2078 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2079 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2083 /*----------------------------------------------------------------------------
2084 | Returns 1 if the single-precision floating-point value `a' is equal to
2085 | the corresponding value `b', and 0 otherwise. The invalid exception is
2086 | raised if either operand is a NaN. Otherwise, the comparison is performed
2087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088 *----------------------------------------------------------------------------*/
2090 flag
float32_eq_signaling( float32 a
, float32 b
)
2093 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2094 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2096 float_raise( float_flag_invalid
);
2099 return ( a
== b
) || ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2103 /*----------------------------------------------------------------------------
2104 | Returns 1 if the single-precision floating-point value `a' is less than or
2105 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2106 | cause an exception. Otherwise, the comparison is performed according to the
2107 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2108 *----------------------------------------------------------------------------*/
2110 flag
float32_le_quiet( float32 a
, float32 b
)
2115 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2116 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2118 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2119 float_raise( float_flag_invalid
);
2123 aSign
= extractFloat32Sign( a
);
2124 bSign
= extractFloat32Sign( b
);
2125 if ( aSign
!= bSign
) return aSign
|| ( (bits32
) ( ( a
| b
)<<1 ) == 0 );
2126 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2130 /*----------------------------------------------------------------------------
2131 | Returns 1 if the single-precision floating-point value `a' is less than
2132 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2133 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2134 | Standard for Binary Floating-Point Arithmetic.
2135 *----------------------------------------------------------------------------*/
2137 flag
float32_lt_quiet( float32 a
, float32 b
)
2141 if ( ( ( extractFloat32Exp( a
) == 0xFF ) && extractFloat32Frac( a
) )
2142 || ( ( extractFloat32Exp( b
) == 0xFF ) && extractFloat32Frac( b
) )
2144 if ( float32_is_signaling_nan( a
) || float32_is_signaling_nan( b
) ) {
2145 float_raise( float_flag_invalid
);
2149 aSign
= extractFloat32Sign( a
);
2150 bSign
= extractFloat32Sign( b
);
2151 if ( aSign
!= bSign
) return aSign
&& ( (bits32
) ( ( a
| b
)<<1 ) != 0 );
2152 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
2156 /*----------------------------------------------------------------------------
2157 | Returns the result of converting the double-precision floating-point value
2158 | `a' to the 32-bit two's complement integer format. The conversion is
2159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2160 | Arithmetic---which means in particular that the conversion is rounded
2161 | according to the current rounding mode. If `a' is a NaN, the largest
2162 | positive integer is returned. Otherwise, if the conversion overflows, the
2163 | largest integer with the same sign as `a' is returned.
2164 *----------------------------------------------------------------------------*/
2166 int32
float64_to_int32( float64 a
)
2169 int16 aExp
, shiftCount
;
2172 aSig
= extractFloat64Frac( a
);
2173 aExp
= extractFloat64Exp( a
);
2174 aSign
= extractFloat64Sign( a
);
2175 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2176 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2177 shiftCount
= 0x42C - aExp
;
2178 if ( 0 < shiftCount
) shift64RightJamming( aSig
, shiftCount
, &aSig
);
2179 return roundAndPackInt32( aSign
, aSig
);
2183 /*----------------------------------------------------------------------------
2184 | Returns the result of converting the double-precision floating-point value
2185 | `a' to the 32-bit two's complement integer format. The conversion is
2186 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2187 | Arithmetic, except that the conversion is always rounded toward zero.
2188 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2189 | the conversion overflows, the largest integer with the same sign as `a' is
2191 *----------------------------------------------------------------------------*/
2193 int32
float64_to_int32_round_to_zero( float64 a
)
2196 int16 aExp
, shiftCount
;
2197 bits64 aSig
, savedASig
;
2200 aSig
= extractFloat64Frac( a
);
2201 aExp
= extractFloat64Exp( a
);
2202 aSign
= extractFloat64Sign( a
);
2203 if ( 0x41E < aExp
) {
2204 if ( ( aExp
== 0x7FF ) && aSig
) aSign
= 0;
2207 else if ( aExp
< 0x3FF ) {
2208 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
2211 aSig
|= LIT64( 0x0010000000000000 );
2212 shiftCount
= 0x433 - aExp
;
2214 aSig
>>= shiftCount
;
2216 if ( aSign
) z
= - z
;
2217 if ( ( z
< 0 ) ^ aSign
) {
2219 float_raise( float_flag_invalid
);
2220 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
2222 if ( ( aSig
<<shiftCount
) != savedASig
) {
2223 float_exception_flags
|= float_flag_inexact
;
2229 /*----------------------------------------------------------------------------
2230 | Returns the result of converting the double-precision floating-point value
2231 | `a' to the 64-bit two's complement integer format. The conversion is
2232 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2233 | Arithmetic---which means in particular that the conversion is rounded
2234 | according to the current rounding mode. If `a' is a NaN, the largest
2235 | positive integer is returned. Otherwise, if the conversion overflows, the
2236 | largest integer with the same sign as `a' is returned.
2237 *----------------------------------------------------------------------------*/
2239 int64
float64_to_int64( float64 a
)
2242 int16 aExp
, shiftCount
;
2243 bits64 aSig
, aSigExtra
;
2245 aSig
= extractFloat64Frac( a
);
2246 aExp
= extractFloat64Exp( a
);
2247 aSign
= extractFloat64Sign( a
);
2248 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2249 shiftCount
= 0x433 - aExp
;
2250 if ( shiftCount
<= 0 ) {
2251 if ( 0x43E < aExp
) {
2252 float_raise( float_flag_invalid
);
2254 || ( ( aExp
== 0x7FF )
2255 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2257 return LIT64( 0x7FFFFFFFFFFFFFFF );
2259 return (sbits64
) LIT64( 0x8000000000000000 );
2262 aSig
<<= - shiftCount
;
2265 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
2267 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
2271 /*----------------------------------------------------------------------------
2272 | Returns the result of converting the double-precision floating-point value
2273 | `a' to the 64-bit two's complement integer format. The conversion is
2274 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2275 | Arithmetic, except that the conversion is always rounded toward zero.
2276 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2277 | the conversion overflows, the largest integer with the same sign as `a' is
2279 *----------------------------------------------------------------------------*/
2281 int64
float64_to_int64_round_to_zero( float64 a
)
2284 int16 aExp
, shiftCount
;
2288 aSig
= extractFloat64Frac( a
);
2289 aExp
= extractFloat64Exp( a
);
2290 aSign
= extractFloat64Sign( a
);
2291 if ( aExp
) aSig
|= LIT64( 0x0010000000000000 );
2292 shiftCount
= aExp
- 0x433;
2293 if ( 0 <= shiftCount
) {
2294 if ( 0x43E <= aExp
) {
2295 if ( a
!= LIT64( 0xC3E0000000000000 ) ) {
2296 float_raise( float_flag_invalid
);
2298 || ( ( aExp
== 0x7FF )
2299 && ( aSig
!= LIT64( 0x0010000000000000 ) ) )
2301 return LIT64( 0x7FFFFFFFFFFFFFFF );
2304 return (sbits64
) LIT64( 0x8000000000000000 );
2306 z
= aSig
<<shiftCount
;
2309 if ( aExp
< 0x3FE ) {
2310 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
2313 z
= aSig
>>( - shiftCount
);
2314 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
2315 float_exception_flags
|= float_flag_inexact
;
2318 if ( aSign
) z
= - z
;
2323 /*----------------------------------------------------------------------------
2324 | Returns the result of converting the double-precision floating-point value
2325 | `a' to the single-precision floating-point format. The conversion is
2326 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2328 *----------------------------------------------------------------------------*/
2330 float32
float64_to_float32( float64 a
)
2337 aSig
= extractFloat64Frac( a
);
2338 aExp
= extractFloat64Exp( a
);
2339 aSign
= extractFloat64Sign( a
);
2340 if ( aExp
== 0x7FF ) {
2341 if ( aSig
) return commonNaNToFloat32( float64ToCommonNaN( a
) );
2342 return packFloat32( aSign
, 0xFF, 0 );
2344 shift64RightJamming( aSig
, 22, &aSig
);
2346 if ( aExp
|| zSig
) {
2350 return roundAndPackFloat32( aSign
, aExp
, zSig
);
2356 /*----------------------------------------------------------------------------
2357 | Returns the result of converting the double-precision floating-point value
2358 | `a' to the extended double-precision floating-point format. The conversion
2359 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2361 *----------------------------------------------------------------------------*/
2363 floatx80
float64_to_floatx80( float64 a
)
2369 aSig
= extractFloat64Frac( a
);
2370 aExp
= extractFloat64Exp( a
);
2371 aSign
= extractFloat64Sign( a
);
2372 if ( aExp
== 0x7FF ) {
2373 if ( aSig
) return commonNaNToFloatx80( float64ToCommonNaN( a
) );
2374 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
2377 if ( aSig
== 0 ) return packFloatx80( aSign
, 0, 0 );
2378 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2382 aSign
, aExp
+ 0x3C00, ( aSig
| LIT64( 0x0010000000000000 ) )<<11 );
2390 /*----------------------------------------------------------------------------
2391 | Returns the result of converting the double-precision floating-point value
2392 | `a' to the quadruple-precision floating-point format. The conversion is
2393 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2395 *----------------------------------------------------------------------------*/
2397 float128
float64_to_float128( float64 a
)
2401 bits64 aSig
, zSig0
, zSig1
;
2403 aSig
= extractFloat64Frac( a
);
2404 aExp
= extractFloat64Exp( a
);
2405 aSign
= extractFloat64Sign( a
);
2406 if ( aExp
== 0x7FF ) {
2407 if ( aSig
) return commonNaNToFloat128( float64ToCommonNaN( a
) );
2408 return packFloat128( aSign
, 0x7FFF, 0, 0 );
2411 if ( aSig
== 0 ) return packFloat128( aSign
, 0, 0, 0 );
2412 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2415 shift128Right( aSig
, 0, 4, &zSig0
, &zSig1
);
2416 return packFloat128( aSign
, aExp
+ 0x3C00, zSig0
, zSig1
);
2422 /*----------------------------------------------------------------------------
2423 | Rounds the double-precision floating-point value `a' to an integer, and
2424 | returns the result as a double-precision floating-point value. The
2425 | operation is performed according to the IEC/IEEE Standard for Binary
2426 | Floating-Point Arithmetic.
2427 *----------------------------------------------------------------------------*/
2429 float64
float64_round_to_int( float64 a
)
2433 bits64 lastBitMask
, roundBitsMask
;
2437 aExp
= extractFloat64Exp( a
);
2438 if ( 0x433 <= aExp
) {
2439 if ( ( aExp
== 0x7FF ) && extractFloat64Frac( a
) ) {
2440 return propagateFloat64NaN( a
, a
);
2444 if ( aExp
< 0x3FF ) {
2445 if ( (bits64
) ( a
<<1 ) == 0 ) return a
;
2446 float_exception_flags
|= float_flag_inexact
;
2447 aSign
= extractFloat64Sign( a
);
2448 switch ( float_rounding_mode
) {
2449 case float_round_nearest_even
:
2450 if ( ( aExp
== 0x3FE ) && extractFloat64Frac( a
) ) {
2451 return packFloat64( aSign
, 0x3FF, 0 );
2454 case float_round_down
:
2455 return aSign
? LIT64( 0xBFF0000000000000 ) : 0;
2456 case float_round_up
:
2458 aSign
? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2460 return packFloat64( aSign
, 0, 0 );
2463 lastBitMask
<<= 0x433 - aExp
;
2464 roundBitsMask
= lastBitMask
- 1;
2466 roundingMode
= float_rounding_mode
;
2467 if ( roundingMode
== float_round_nearest_even
) {
2468 z
+= lastBitMask
>>1;
2469 if ( ( z
& roundBitsMask
) == 0 ) z
&= ~ lastBitMask
;
2471 else if ( roundingMode
!= float_round_to_zero
) {
2472 if ( extractFloat64Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
2476 z
&= ~ roundBitsMask
;
2477 if ( z
!= a
) float_exception_flags
|= float_flag_inexact
;
2482 /*----------------------------------------------------------------------------
2483 | Returns the result of adding the absolute values of the double-precision
2484 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2485 | before being returned. `zSign' is ignored if the result is a NaN.
2486 | The addition is performed according to the IEC/IEEE Standard for Binary
2487 | Floating-Point Arithmetic.
2488 *----------------------------------------------------------------------------*/
2490 static float64
addFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2492 int16 aExp
, bExp
, zExp
;
2493 bits64 aSig
, bSig
, zSig
;
2496 aSig
= extractFloat64Frac( a
);
2497 aExp
= extractFloat64Exp( a
);
2498 bSig
= extractFloat64Frac( b
);
2499 bExp
= extractFloat64Exp( b
);
2500 expDiff
= aExp
- bExp
;
2503 if ( 0 < expDiff
) {
2504 if ( aExp
== 0x7FF ) {
2505 if ( aSig
) return propagateFloat64NaN( a
, b
);
2512 bSig
|= LIT64( 0x2000000000000000 );
2514 shift64RightJamming( bSig
, expDiff
, &bSig
);
2517 else if ( expDiff
< 0 ) {
2518 if ( bExp
== 0x7FF ) {
2519 if ( bSig
) return propagateFloat64NaN( a
, b
);
2520 return packFloat64( zSign
, 0x7FF, 0 );
2526 aSig
|= LIT64( 0x2000000000000000 );
2528 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2532 if ( aExp
== 0x7FF ) {
2533 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2536 if ( aExp
== 0 ) return packFloat64( zSign
, 0, ( aSig
+ bSig
)>>9 );
2537 zSig
= LIT64( 0x4000000000000000 ) + aSig
+ bSig
;
2541 aSig
|= LIT64( 0x2000000000000000 );
2542 zSig
= ( aSig
+ bSig
)<<1;
2544 if ( (sbits64
) zSig
< 0 ) {
2549 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of subtracting the absolute values of the double-
2555 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2556 | difference is negated before being returned. `zSign' is ignored if the
2557 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2558 | Standard for Binary Floating-Point Arithmetic.
2559 *----------------------------------------------------------------------------*/
2561 static float64
subFloat64Sigs( float64 a
, float64 b
, flag zSign
)
2563 int16 aExp
, bExp
, zExp
;
2564 bits64 aSig
, bSig
, zSig
;
2567 aSig
= extractFloat64Frac( a
);
2568 aExp
= extractFloat64Exp( a
);
2569 bSig
= extractFloat64Frac( b
);
2570 bExp
= extractFloat64Exp( b
);
2571 expDiff
= aExp
- bExp
;
2574 if ( 0 < expDiff
) goto aExpBigger
;
2575 if ( expDiff
< 0 ) goto bExpBigger
;
2576 if ( aExp
== 0x7FF ) {
2577 if ( aSig
| bSig
) return propagateFloat64NaN( a
, b
);
2578 float_raise( float_flag_invalid
);
2579 return float64_default_nan
;
2585 if ( bSig
< aSig
) goto aBigger
;
2586 if ( aSig
< bSig
) goto bBigger
;
2587 return packFloat64( float_rounding_mode
== float_round_down
, 0, 0 );
2589 if ( bExp
== 0x7FF ) {
2590 if ( bSig
) return propagateFloat64NaN( a
, b
);
2591 return packFloat64( zSign
^ 1, 0x7FF, 0 );
2597 aSig
|= LIT64( 0x4000000000000000 );
2599 shift64RightJamming( aSig
, - expDiff
, &aSig
);
2600 bSig
|= LIT64( 0x4000000000000000 );
2605 goto normalizeRoundAndPack
;
2607 if ( aExp
== 0x7FF ) {
2608 if ( aSig
) return propagateFloat64NaN( a
, b
);
2615 bSig
|= LIT64( 0x4000000000000000 );
2617 shift64RightJamming( bSig
, expDiff
, &bSig
);
2618 aSig
|= LIT64( 0x4000000000000000 );
2622 normalizeRoundAndPack
:
2624 return normalizeRoundAndPackFloat64( zSign
, zExp
, zSig
);
2628 /*----------------------------------------------------------------------------
2629 | Returns the result of adding the double-precision floating-point values `a'
2630 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2631 | Binary Floating-Point Arithmetic.
2632 *----------------------------------------------------------------------------*/
2634 float64
float64_add( float64 a
, float64 b
)
2638 aSign
= extractFloat64Sign( a
);
2639 bSign
= extractFloat64Sign( b
);
2640 if ( aSign
== bSign
) {
2641 return addFloat64Sigs( a
, b
, aSign
);
2644 return subFloat64Sigs( a
, b
, aSign
);
2649 /*----------------------------------------------------------------------------
2650 | Returns the result of subtracting the double-precision floating-point values
2651 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2652 | for Binary Floating-Point Arithmetic.
2653 *----------------------------------------------------------------------------*/
2655 float64
float64_sub( float64 a
, float64 b
)
2659 aSign
= extractFloat64Sign( a
);
2660 bSign
= extractFloat64Sign( b
);
2661 if ( aSign
== bSign
) {
2662 return subFloat64Sigs( a
, b
, aSign
);
2665 return addFloat64Sigs( a
, b
, aSign
);
2670 /*----------------------------------------------------------------------------
2671 | Returns the result of multiplying the double-precision floating-point values
2672 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2673 | for Binary Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2676 float64
float64_mul( float64 a
, float64 b
)
2678 flag aSign
, bSign
, zSign
;
2679 int16 aExp
, bExp
, zExp
;
2680 bits64 aSig
, bSig
, zSig0
, zSig1
;
2682 aSig
= extractFloat64Frac( a
);
2683 aExp
= extractFloat64Exp( a
);
2684 aSign
= extractFloat64Sign( a
);
2685 bSig
= extractFloat64Frac( b
);
2686 bExp
= extractFloat64Exp( b
);
2687 bSign
= extractFloat64Sign( b
);
2688 zSign
= aSign
^ bSign
;
2689 if ( aExp
== 0x7FF ) {
2690 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2691 return propagateFloat64NaN( a
, b
);
2693 if ( ( bExp
| bSig
) == 0 ) {
2694 float_raise( float_flag_invalid
);
2695 return float64_default_nan
;
2697 return packFloat64( zSign
, 0x7FF, 0 );
2699 if ( bExp
== 0x7FF ) {
2700 if ( bSig
) return propagateFloat64NaN( a
, b
);
2701 if ( ( aExp
| aSig
) == 0 ) {
2702 float_raise( float_flag_invalid
);
2703 return float64_default_nan
;
2705 return packFloat64( zSign
, 0x7FF, 0 );
2708 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2709 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2712 if ( bSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2713 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2715 zExp
= aExp
+ bExp
- 0x3FF;
2716 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2717 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2718 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
2719 zSig0
|= ( zSig1
!= 0 );
2720 if ( 0 <= (sbits64
) ( zSig0
<<1 ) ) {
2724 return roundAndPackFloat64( zSign
, zExp
, zSig0
);
2728 /*----------------------------------------------------------------------------
2729 | Returns the result of dividing the double-precision floating-point value `a'
2730 | by the corresponding value `b'. The operation is performed according to
2731 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2732 *----------------------------------------------------------------------------*/
2734 float64
float64_div( float64 a
, float64 b
)
2736 flag aSign
, bSign
, zSign
;
2737 int16 aExp
, bExp
, zExp
;
2738 bits64 aSig
, bSig
, zSig
;
2740 bits64 term0
, term1
;
2742 aSig
= extractFloat64Frac( a
);
2743 aExp
= extractFloat64Exp( a
);
2744 aSign
= extractFloat64Sign( a
);
2745 bSig
= extractFloat64Frac( b
);
2746 bExp
= extractFloat64Exp( b
);
2747 bSign
= extractFloat64Sign( b
);
2748 zSign
= aSign
^ bSign
;
2749 if ( aExp
== 0x7FF ) {
2750 if ( aSig
) return propagateFloat64NaN( a
, b
);
2751 if ( bExp
== 0x7FF ) {
2752 if ( bSig
) return propagateFloat64NaN( a
, b
);
2753 float_raise( float_flag_invalid
);
2754 return float64_default_nan
;
2756 return packFloat64( zSign
, 0x7FF, 0 );
2758 if ( bExp
== 0x7FF ) {
2759 if ( bSig
) return propagateFloat64NaN( a
, b
);
2760 return packFloat64( zSign
, 0, 0 );
2764 if ( ( aExp
| aSig
) == 0 ) {
2765 float_raise( float_flag_invalid
);
2766 return float64_default_nan
;
2768 float_raise( float_flag_divbyzero
);
2769 return packFloat64( zSign
, 0x7FF, 0 );
2771 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2774 if ( aSig
== 0 ) return packFloat64( zSign
, 0, 0 );
2775 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2777 zExp
= aExp
- bExp
+ 0x3FD;
2778 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<10;
2779 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2780 if ( bSig
<= ( aSig
+ aSig
) ) {
2784 zSig
= estimateDiv128To64( aSig
, 0, bSig
);
2785 if ( ( zSig
& 0x1FF ) <= 2 ) {
2786 mul64To128( bSig
, zSig
, &term0
, &term1
);
2787 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2788 while ( (sbits64
) rem0
< 0 ) {
2790 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
2792 zSig
|= ( rem1
!= 0 );
2794 return roundAndPackFloat64( zSign
, zExp
, zSig
);
2798 /*----------------------------------------------------------------------------
2799 | Returns the remainder of the double-precision floating-point value `a'
2800 | with respect to the corresponding value `b'. The operation is performed
2801 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2802 *----------------------------------------------------------------------------*/
2804 float64
float64_rem( float64 a
, float64 b
)
2806 flag aSign
, bSign
, zSign
;
2807 int16 aExp
, bExp
, expDiff
;
2809 bits64 q
, alternateASig
;
2812 aSig
= extractFloat64Frac( a
);
2813 aExp
= extractFloat64Exp( a
);
2814 aSign
= extractFloat64Sign( a
);
2815 bSig
= extractFloat64Frac( b
);
2816 bExp
= extractFloat64Exp( b
);
2817 bSign
= extractFloat64Sign( b
);
2818 if ( aExp
== 0x7FF ) {
2819 if ( aSig
|| ( ( bExp
== 0x7FF ) && bSig
) ) {
2820 return propagateFloat64NaN( a
, b
);
2822 float_raise( float_flag_invalid
);
2823 return float64_default_nan
;
2825 if ( bExp
== 0x7FF ) {
2826 if ( bSig
) return propagateFloat64NaN( a
, b
);
2831 float_raise( float_flag_invalid
);
2832 return float64_default_nan
;
2834 normalizeFloat64Subnormal( bSig
, &bExp
, &bSig
);
2837 if ( aSig
== 0 ) return a
;
2838 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2840 expDiff
= aExp
- bExp
;
2841 aSig
= ( aSig
| LIT64( 0x0010000000000000 ) )<<11;
2842 bSig
= ( bSig
| LIT64( 0x0010000000000000 ) )<<11;
2843 if ( expDiff
< 0 ) {
2844 if ( expDiff
< -1 ) return a
;
2847 q
= ( bSig
<= aSig
);
2848 if ( q
) aSig
-= bSig
;
2850 while ( 0 < expDiff
) {
2851 q
= estimateDiv128To64( aSig
, 0, bSig
);
2852 q
= ( 2 < q
) ? q
- 2 : 0;
2853 aSig
= - ( ( bSig
>>2 ) * q
);
2857 if ( 0 < expDiff
) {
2858 q
= estimateDiv128To64( aSig
, 0, bSig
);
2859 q
= ( 2 < q
) ? q
- 2 : 0;
2862 aSig
= ( ( aSig
>>1 )<<( expDiff
- 1 ) ) - bSig
* q
;
2869 alternateASig
= aSig
;
2872 } while ( 0 <= (sbits64
) aSig
);
2873 sigMean
= aSig
+ alternateASig
;
2874 if ( ( sigMean
< 0 ) || ( ( sigMean
== 0 ) && ( q
& 1 ) ) ) {
2875 aSig
= alternateASig
;
2877 zSign
= ( (sbits64
) aSig
< 0 );
2878 if ( zSign
) aSig
= - aSig
;
2879 return normalizeRoundAndPackFloat64( aSign
^ zSign
, bExp
, aSig
);
2883 /*----------------------------------------------------------------------------
2884 | Returns the square root of the double-precision floating-point value `a'.
2885 | The operation is performed according to the IEC/IEEE Standard for Binary
2886 | Floating-Point Arithmetic.
2887 *----------------------------------------------------------------------------*/
2889 float64
float64_sqrt( float64 a
)
2893 bits64 aSig
, zSig
, doubleZSig
;
2894 bits64 rem0
, rem1
, term0
, term1
;
2897 aSig
= extractFloat64Frac( a
);
2898 aExp
= extractFloat64Exp( a
);
2899 aSign
= extractFloat64Sign( a
);
2900 if ( aExp
== 0x7FF ) {
2901 if ( aSig
) return propagateFloat64NaN( a
, a
);
2902 if ( ! aSign
) return a
;
2903 float_raise( float_flag_invalid
);
2904 return float64_default_nan
;
2907 if ( ( aExp
| aSig
) == 0 ) return a
;
2908 float_raise( float_flag_invalid
);
2909 return float64_default_nan
;
2912 if ( aSig
== 0 ) return 0;
2913 normalizeFloat64Subnormal( aSig
, &aExp
, &aSig
);
2915 zExp
= ( ( aExp
- 0x3FF )>>1 ) + 0x3FE;
2916 aSig
|= LIT64( 0x0010000000000000 );
2917 zSig
= estimateSqrt32( aExp
, aSig
>>21 );
2918 aSig
<<= 9 - ( aExp
& 1 );
2919 zSig
= estimateDiv128To64( aSig
, 0, zSig
<<32 ) + ( zSig
<<30 );
2920 if ( ( zSig
& 0x1FF ) <= 5 ) {
2921 doubleZSig
= zSig
<<1;
2922 mul64To128( zSig
, zSig
, &term0
, &term1
);
2923 sub128( aSig
, 0, term0
, term1
, &rem0
, &rem1
);
2924 while ( (sbits64
) rem0
< 0 ) {
2927 add128( rem0
, rem1
, zSig
>>63, doubleZSig
| 1, &rem0
, &rem1
);
2929 zSig
|= ( ( rem0
| rem1
) != 0 );
2931 return roundAndPackFloat64( 0, zExp
, zSig
);
2935 /*----------------------------------------------------------------------------
2936 | Returns 1 if the double-precision floating-point value `a' is equal to the
2937 | corresponding value `b', and 0 otherwise. The comparison is performed
2938 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2939 *----------------------------------------------------------------------------*/
2941 flag
float64_eq( float64 a
, float64 b
)
2944 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2945 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2947 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
2948 float_raise( float_flag_invalid
);
2952 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2956 /*----------------------------------------------------------------------------
2957 | Returns 1 if the double-precision floating-point value `a' is less than or
2958 | equal to the corresponding value `b', and 0 otherwise. The comparison is
2959 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2961 *----------------------------------------------------------------------------*/
2963 flag
float64_le( float64 a
, float64 b
)
2967 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2968 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2970 float_raise( float_flag_invalid
);
2973 aSign
= extractFloat64Sign( a
);
2974 bSign
= extractFloat64Sign( b
);
2975 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
2976 return ( a
== b
) || ( aSign
^ ( a
< b
) );
2980 /*----------------------------------------------------------------------------
2981 | Returns 1 if the double-precision floating-point value `a' is less than
2982 | the corresponding value `b', and 0 otherwise. The comparison is performed
2983 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2984 *----------------------------------------------------------------------------*/
2986 flag
float64_lt( float64 a
, float64 b
)
2990 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
2991 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
2993 float_raise( float_flag_invalid
);
2996 aSign
= extractFloat64Sign( a
);
2997 bSign
= extractFloat64Sign( b
);
2998 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
2999 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3003 /*----------------------------------------------------------------------------
3004 | Returns 1 if the double-precision floating-point value `a' is equal to the
3005 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3006 | if either operand is a NaN. Otherwise, the comparison is performed
3007 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3008 *----------------------------------------------------------------------------*/
3010 flag
float64_eq_signaling( float64 a
, float64 b
)
3013 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3014 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3016 float_raise( float_flag_invalid
);
3019 return ( a
== b
) || ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3023 /*----------------------------------------------------------------------------
3024 | Returns 1 if the double-precision floating-point value `a' is less than or
3025 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3026 | cause an exception. Otherwise, the comparison is performed according to the
3027 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3028 *----------------------------------------------------------------------------*/
3030 flag
float64_le_quiet( float64 a
, float64 b
)
3035 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3036 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3038 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3039 float_raise( float_flag_invalid
);
3043 aSign
= extractFloat64Sign( a
);
3044 bSign
= extractFloat64Sign( b
);
3045 if ( aSign
!= bSign
) return aSign
|| ( (bits64
) ( ( a
| b
)<<1 ) == 0 );
3046 return ( a
== b
) || ( aSign
^ ( a
< b
) );
3050 /*----------------------------------------------------------------------------
3051 | Returns 1 if the double-precision floating-point value `a' is less than
3052 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3053 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3054 | Standard for Binary Floating-Point Arithmetic.
3055 *----------------------------------------------------------------------------*/
3057 flag
float64_lt_quiet( float64 a
, float64 b
)
3061 if ( ( ( extractFloat64Exp( a
) == 0x7FF ) && extractFloat64Frac( a
) )
3062 || ( ( extractFloat64Exp( b
) == 0x7FF ) && extractFloat64Frac( b
) )
3064 if ( float64_is_signaling_nan( a
) || float64_is_signaling_nan( b
) ) {
3065 float_raise( float_flag_invalid
);
3069 aSign
= extractFloat64Sign( a
);
3070 bSign
= extractFloat64Sign( b
);
3071 if ( aSign
!= bSign
) return aSign
&& ( (bits64
) ( ( a
| b
)<<1 ) != 0 );
3072 return ( a
!= b
) && ( aSign
^ ( a
< b
) );
3078 /*----------------------------------------------------------------------------
3079 | Returns the result of converting the extended double-precision floating-
3080 | point value `a' to the 32-bit two's complement integer format. The
3081 | conversion is performed according to the IEC/IEEE Standard for Binary
3082 | Floating-Point Arithmetic---which means in particular that the conversion
3083 | is rounded according to the current rounding mode. If `a' is a NaN, the
3084 | largest positive integer is returned. Otherwise, if the conversion
3085 | overflows, the largest integer with the same sign as `a' is returned.
3086 *----------------------------------------------------------------------------*/
3088 int32
floatx80_to_int32( floatx80 a
)
3091 int32 aExp
, shiftCount
;
3094 aSig
= extractFloatx80Frac( a
);
3095 aExp
= extractFloatx80Exp( a
);
3096 aSign
= extractFloatx80Sign( a
);
3097 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3098 shiftCount
= 0x4037 - aExp
;
3099 if ( shiftCount
<= 0 ) shiftCount
= 1;
3100 shift64RightJamming( aSig
, shiftCount
, &aSig
);
3101 return roundAndPackInt32( aSign
, aSig
);
3105 /*----------------------------------------------------------------------------
3106 | Returns the result of converting the extended double-precision floating-
3107 | point value `a' to the 32-bit two's complement integer format. The
3108 | conversion is performed according to the IEC/IEEE Standard for Binary
3109 | Floating-Point Arithmetic, except that the conversion is always rounded
3110 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3111 | Otherwise, if the conversion overflows, the largest integer with the same
3112 | sign as `a' is returned.
3113 *----------------------------------------------------------------------------*/
3115 int32
floatx80_to_int32_round_to_zero( floatx80 a
)
3118 int32 aExp
, shiftCount
;
3119 bits64 aSig
, savedASig
;
3122 aSig
= extractFloatx80Frac( a
);
3123 aExp
= extractFloatx80Exp( a
);
3124 aSign
= extractFloatx80Sign( a
);
3125 if ( 0x401E < aExp
) {
3126 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) aSign
= 0;
3129 else if ( aExp
< 0x3FFF ) {
3130 if ( aExp
|| aSig
) float_exception_flags
|= float_flag_inexact
;
3133 shiftCount
= 0x403E - aExp
;
3135 aSig
>>= shiftCount
;
3137 if ( aSign
) z
= - z
;
3138 if ( ( z
< 0 ) ^ aSign
) {
3140 float_raise( float_flag_invalid
);
3141 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
3143 if ( ( aSig
<<shiftCount
) != savedASig
) {
3144 float_exception_flags
|= float_flag_inexact
;
3150 /*----------------------------------------------------------------------------
3151 | Returns the result of converting the extended double-precision floating-
3152 | point value `a' to the 64-bit two's complement integer format. The
3153 | conversion is performed according to the IEC/IEEE Standard for Binary
3154 | Floating-Point Arithmetic---which means in particular that the conversion
3155 | is rounded according to the current rounding mode. If `a' is a NaN,
3156 | the largest positive integer is returned. Otherwise, if the conversion
3157 | overflows, the largest integer with the same sign as `a' is returned.
3158 *----------------------------------------------------------------------------*/
3160 int64
floatx80_to_int64( floatx80 a
)
3163 int32 aExp
, shiftCount
;
3164 bits64 aSig
, aSigExtra
;
3166 aSig
= extractFloatx80Frac( a
);
3167 aExp
= extractFloatx80Exp( a
);
3168 aSign
= extractFloatx80Sign( a
);
3169 shiftCount
= 0x403E - aExp
;
3170 if ( shiftCount
<= 0 ) {
3172 float_raise( float_flag_invalid
);
3174 || ( ( aExp
== 0x7FFF )
3175 && ( aSig
!= LIT64( 0x8000000000000000 ) ) )
3177 return LIT64( 0x7FFFFFFFFFFFFFFF );
3179 return (sbits64
) LIT64( 0x8000000000000000 );
3184 shift64ExtraRightJamming( aSig
, 0, shiftCount
, &aSig
, &aSigExtra
);
3186 return roundAndPackInt64( aSign
, aSig
, aSigExtra
);
3190 /*----------------------------------------------------------------------------
3191 | Returns the result of converting the extended double-precision floating-
3192 | point value `a' to the 64-bit two's complement integer format. The
3193 | conversion is performed according to the IEC/IEEE Standard for Binary
3194 | Floating-Point Arithmetic, except that the conversion is always rounded
3195 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3196 | Otherwise, if the conversion overflows, the largest integer with the same
3197 | sign as `a' is returned.
3198 *----------------------------------------------------------------------------*/
3200 int64
floatx80_to_int64_round_to_zero( floatx80 a
)
3203 int32 aExp
, shiftCount
;
3207 aSig
= extractFloatx80Frac( a
);
3208 aExp
= extractFloatx80Exp( a
);
3209 aSign
= extractFloatx80Sign( a
);
3210 shiftCount
= aExp
- 0x403E;
3211 if ( 0 <= shiftCount
) {
3212 aSig
&= LIT64( 0x7FFFFFFFFFFFFFFF );
3213 if ( ( a
.high
!= 0xC03E ) || aSig
) {
3214 float_raise( float_flag_invalid
);
3215 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && aSig
) ) {
3216 return LIT64( 0x7FFFFFFFFFFFFFFF );
3219 return (sbits64
) LIT64( 0x8000000000000000 );
3221 else if ( aExp
< 0x3FFF ) {
3222 if ( aExp
| aSig
) float_exception_flags
|= float_flag_inexact
;
3225 z
= aSig
>>( - shiftCount
);
3226 if ( (bits64
) ( aSig
<<( shiftCount
& 63 ) ) ) {
3227 float_exception_flags
|= float_flag_inexact
;
3229 if ( aSign
) z
= - z
;
3234 /*----------------------------------------------------------------------------
3235 | Returns the result of converting the extended double-precision floating-
3236 | point value `a' to the single-precision floating-point format. The
3237 | conversion is performed according to the IEC/IEEE Standard for Binary
3238 | Floating-Point Arithmetic.
3239 *----------------------------------------------------------------------------*/
3241 float32
floatx80_to_float32( floatx80 a
)
3247 aSig
= extractFloatx80Frac( a
);
3248 aExp
= extractFloatx80Exp( a
);
3249 aSign
= extractFloatx80Sign( a
);
3250 if ( aExp
== 0x7FFF ) {
3251 if ( (bits64
) ( aSig
<<1 ) ) {
3252 return commonNaNToFloat32( floatx80ToCommonNaN( a
) );
3254 return packFloat32( aSign
, 0xFF, 0 );
3256 shift64RightJamming( aSig
, 33, &aSig
);
3257 if ( aExp
|| aSig
) aExp
-= 0x3F81;
3258 return roundAndPackFloat32( aSign
, aExp
, aSig
);
3262 /*----------------------------------------------------------------------------
3263 | Returns the result of converting the extended double-precision floating-
3264 | point value `a' to the double-precision floating-point format. The
3265 | conversion is performed according to the IEC/IEEE Standard for Binary
3266 | Floating-Point Arithmetic.
3267 *----------------------------------------------------------------------------*/
3269 float64
floatx80_to_float64( floatx80 a
)
3275 aSig
= extractFloatx80Frac( a
);
3276 aExp
= extractFloatx80Exp( a
);
3277 aSign
= extractFloatx80Sign( a
);
3278 if ( aExp
== 0x7FFF ) {
3279 if ( (bits64
) ( aSig
<<1 ) ) {
3280 return commonNaNToFloat64( floatx80ToCommonNaN( a
) );
3282 return packFloat64( aSign
, 0x7FF, 0 );
3284 shift64RightJamming( aSig
, 1, &zSig
);
3285 if ( aExp
|| aSig
) aExp
-= 0x3C01;
3286 return roundAndPackFloat64( aSign
, aExp
, zSig
);
3292 /*----------------------------------------------------------------------------
3293 | Returns the result of converting the extended double-precision floating-
3294 | point value `a' to the quadruple-precision floating-point format. The
3295 | conversion is performed according to the IEC/IEEE Standard for Binary
3296 | Floating-Point Arithmetic.
3297 *----------------------------------------------------------------------------*/
3299 float128
floatx80_to_float128( floatx80 a
)
3303 bits64 aSig
, zSig0
, zSig1
;
3305 aSig
= extractFloatx80Frac( a
);
3306 aExp
= extractFloatx80Exp( a
);
3307 aSign
= extractFloatx80Sign( a
);
3308 if ( ( aExp
== 0x7FFF ) && (bits64
) ( aSig
<<1 ) ) {
3309 return commonNaNToFloat128( floatx80ToCommonNaN( a
) );
3311 shift128Right( aSig
<<1, 0, 16, &zSig0
, &zSig1
);
3312 return packFloat128( aSign
, aExp
, zSig0
, zSig1
);
3318 /*----------------------------------------------------------------------------
3319 | Rounds the extended double-precision floating-point value `a' to an integer,
3320 | and returns the result as an extended quadruple-precision floating-point
3321 | value. The operation is performed according to the IEC/IEEE Standard for
3322 | Binary Floating-Point Arithmetic.
3323 *----------------------------------------------------------------------------*/
3325 floatx80
floatx80_round_to_int( floatx80 a
)
3329 bits64 lastBitMask
, roundBitsMask
;
3333 aExp
= extractFloatx80Exp( a
);
3334 if ( 0x403E <= aExp
) {
3335 if ( ( aExp
== 0x7FFF ) && (bits64
) ( extractFloatx80Frac( a
)<<1 ) ) {
3336 return propagateFloatx80NaN( a
, a
);
3340 if ( aExp
< 0x3FFF ) {
3342 && ( (bits64
) ( extractFloatx80Frac( a
)<<1 ) == 0 ) ) {
3345 float_exception_flags
|= float_flag_inexact
;
3346 aSign
= extractFloatx80Sign( a
);
3347 switch ( float_rounding_mode
) {
3348 case float_round_nearest_even
:
3349 if ( ( aExp
== 0x3FFE ) && (bits64
) ( extractFloatx80Frac( a
)<<1 )
3352 packFloatx80( aSign
, 0x3FFF, LIT64( 0x8000000000000000 ) );
3355 case float_round_down
:
3358 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3359 : packFloatx80( 0, 0, 0 );
3360 case float_round_up
:
3362 aSign
? packFloatx80( 1, 0, 0 )
3363 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3365 return packFloatx80( aSign
, 0, 0 );
3368 lastBitMask
<<= 0x403E - aExp
;
3369 roundBitsMask
= lastBitMask
- 1;
3371 roundingMode
= float_rounding_mode
;
3372 if ( roundingMode
== float_round_nearest_even
) {
3373 z
.low
+= lastBitMask
>>1;
3374 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
3376 else if ( roundingMode
!= float_round_to_zero
) {
3377 if ( extractFloatx80Sign( z
) ^ ( roundingMode
== float_round_up
) ) {
3378 z
.low
+= roundBitsMask
;
3381 z
.low
&= ~ roundBitsMask
;
3384 z
.low
= LIT64( 0x8000000000000000 );
3386 if ( z
.low
!= a
.low
) float_exception_flags
|= float_flag_inexact
;
3391 /*----------------------------------------------------------------------------
3392 | Returns the result of adding the absolute values of the extended double-
3393 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3394 | negated before being returned. `zSign' is ignored if the result is a NaN.
3395 | The addition is performed according to the IEC/IEEE Standard for Binary
3396 | Floating-Point Arithmetic.
3397 *----------------------------------------------------------------------------*/
3399 static floatx80
addFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3401 int32 aExp
, bExp
, zExp
;
3402 bits64 aSig
, bSig
, zSig0
, zSig1
;
3405 aSig
= extractFloatx80Frac( a
);
3406 aExp
= extractFloatx80Exp( a
);
3407 bSig
= extractFloatx80Frac( b
);
3408 bExp
= extractFloatx80Exp( b
);
3409 expDiff
= aExp
- bExp
;
3410 if ( 0 < expDiff
) {
3411 if ( aExp
== 0x7FFF ) {
3412 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3415 if ( bExp
== 0 ) --expDiff
;
3416 shift64ExtraRightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3419 else if ( expDiff
< 0 ) {
3420 if ( bExp
== 0x7FFF ) {
3421 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3422 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3424 if ( aExp
== 0 ) ++expDiff
;
3425 shift64ExtraRightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3429 if ( aExp
== 0x7FFF ) {
3430 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3431 return propagateFloatx80NaN( a
, b
);
3436 zSig0
= aSig
+ bSig
;
3438 normalizeFloatx80Subnormal( zSig0
, &zExp
, &zSig0
);
3444 zSig0
= aSig
+ bSig
;
3445 if ( (sbits64
) zSig0
< 0 ) goto roundAndPack
;
3447 shift64ExtraRightJamming( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3448 zSig0
|= LIT64( 0x8000000000000000 );
3452 roundAndPackFloatx80(
3453 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3457 /*----------------------------------------------------------------------------
3458 | Returns the result of subtracting the absolute values of the extended
3459 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3460 | difference is negated before being returned. `zSign' is ignored if the
3461 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3462 | Standard for Binary Floating-Point Arithmetic.
3463 *----------------------------------------------------------------------------*/
3465 static floatx80
subFloatx80Sigs( floatx80 a
, floatx80 b
, flag zSign
)
3467 int32 aExp
, bExp
, zExp
;
3468 bits64 aSig
, bSig
, zSig0
, zSig1
;
3472 aSig
= extractFloatx80Frac( a
);
3473 aExp
= extractFloatx80Exp( a
);
3474 bSig
= extractFloatx80Frac( b
);
3475 bExp
= extractFloatx80Exp( b
);
3476 expDiff
= aExp
- bExp
;
3477 if ( 0 < expDiff
) goto aExpBigger
;
3478 if ( expDiff
< 0 ) goto bExpBigger
;
3479 if ( aExp
== 0x7FFF ) {
3480 if ( (bits64
) ( ( aSig
| bSig
)<<1 ) ) {
3481 return propagateFloatx80NaN( a
, b
);
3483 float_raise( float_flag_invalid
);
3484 z
.low
= floatx80_default_nan_low
;
3485 z
.high
= floatx80_default_nan_high
;
3493 if ( bSig
< aSig
) goto aBigger
;
3494 if ( aSig
< bSig
) goto bBigger
;
3495 return packFloatx80( float_rounding_mode
== float_round_down
, 0, 0 );
3497 if ( bExp
== 0x7FFF ) {
3498 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3499 return packFloatx80( zSign
^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3501 if ( aExp
== 0 ) ++expDiff
;
3502 shift128RightJamming( aSig
, 0, - expDiff
, &aSig
, &zSig1
);
3504 sub128( bSig
, 0, aSig
, zSig1
, &zSig0
, &zSig1
);
3507 goto normalizeRoundAndPack
;
3509 if ( aExp
== 0x7FFF ) {
3510 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3513 if ( bExp
== 0 ) --expDiff
;
3514 shift128RightJamming( bSig
, 0, expDiff
, &bSig
, &zSig1
);
3516 sub128( aSig
, 0, bSig
, zSig1
, &zSig0
, &zSig1
);
3518 normalizeRoundAndPack
:
3520 normalizeRoundAndPackFloatx80(
3521 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3525 /*----------------------------------------------------------------------------
3526 | Returns the result of adding the extended double-precision floating-point
3527 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3528 | Standard for Binary Floating-Point Arithmetic.
3529 *----------------------------------------------------------------------------*/
3531 floatx80
floatx80_add( floatx80 a
, floatx80 b
)
3535 aSign
= extractFloatx80Sign( a
);
3536 bSign
= extractFloatx80Sign( b
);
3537 if ( aSign
== bSign
) {
3538 return addFloatx80Sigs( a
, b
, aSign
);
3541 return subFloatx80Sigs( a
, b
, aSign
);
3546 /*----------------------------------------------------------------------------
3547 | Returns the result of subtracting the extended double-precision floating-
3548 | point values `a' and `b'. The operation is performed according to the
3549 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3550 *----------------------------------------------------------------------------*/
3552 floatx80
floatx80_sub( floatx80 a
, floatx80 b
)
3556 aSign
= extractFloatx80Sign( a
);
3557 bSign
= extractFloatx80Sign( b
);
3558 if ( aSign
== bSign
) {
3559 return subFloatx80Sigs( a
, b
, aSign
);
3562 return addFloatx80Sigs( a
, b
, aSign
);
3567 /*----------------------------------------------------------------------------
3568 | Returns the result of multiplying the extended double-precision floating-
3569 | point values `a' and `b'. The operation is performed according to the
3570 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571 *----------------------------------------------------------------------------*/
3573 floatx80
floatx80_mul( floatx80 a
, floatx80 b
)
3575 flag aSign
, bSign
, zSign
;
3576 int32 aExp
, bExp
, zExp
;
3577 bits64 aSig
, bSig
, zSig0
, zSig1
;
3580 aSig
= extractFloatx80Frac( a
);
3581 aExp
= extractFloatx80Exp( a
);
3582 aSign
= extractFloatx80Sign( a
);
3583 bSig
= extractFloatx80Frac( b
);
3584 bExp
= extractFloatx80Exp( b
);
3585 bSign
= extractFloatx80Sign( b
);
3586 zSign
= aSign
^ bSign
;
3587 if ( aExp
== 0x7FFF ) {
3588 if ( (bits64
) ( aSig
<<1 )
3589 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3590 return propagateFloatx80NaN( a
, b
);
3592 if ( ( bExp
| bSig
) == 0 ) goto invalid
;
3593 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3595 if ( bExp
== 0x7FFF ) {
3596 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3597 if ( ( aExp
| aSig
) == 0 ) {
3599 float_raise( float_flag_invalid
);
3600 z
.low
= floatx80_default_nan_low
;
3601 z
.high
= floatx80_default_nan_high
;
3604 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3607 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3608 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3611 if ( bSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3612 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3614 zExp
= aExp
+ bExp
- 0x3FFE;
3615 mul64To128( aSig
, bSig
, &zSig0
, &zSig1
);
3616 if ( 0 < (sbits64
) zSig0
) {
3617 shortShift128Left( zSig0
, zSig1
, 1, &zSig0
, &zSig1
);
3621 roundAndPackFloatx80(
3622 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3626 /*----------------------------------------------------------------------------
3627 | Returns the result of dividing the extended double-precision floating-point
3628 | value `a' by the corresponding value `b'. The operation is performed
3629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3630 *----------------------------------------------------------------------------*/
3632 floatx80
floatx80_div( floatx80 a
, floatx80 b
)
3634 flag aSign
, bSign
, zSign
;
3635 int32 aExp
, bExp
, zExp
;
3636 bits64 aSig
, bSig
, zSig0
, zSig1
;
3637 bits64 rem0
, rem1
, rem2
, term0
, term1
, term2
;
3640 aSig
= extractFloatx80Frac( a
);
3641 aExp
= extractFloatx80Exp( a
);
3642 aSign
= extractFloatx80Sign( a
);
3643 bSig
= extractFloatx80Frac( b
);
3644 bExp
= extractFloatx80Exp( b
);
3645 bSign
= extractFloatx80Sign( b
);
3646 zSign
= aSign
^ bSign
;
3647 if ( aExp
== 0x7FFF ) {
3648 if ( (bits64
) ( aSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3649 if ( bExp
== 0x7FFF ) {
3650 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3653 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3655 if ( bExp
== 0x7FFF ) {
3656 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3657 return packFloatx80( zSign
, 0, 0 );
3661 if ( ( aExp
| aSig
) == 0 ) {
3663 float_raise( float_flag_invalid
);
3664 z
.low
= floatx80_default_nan_low
;
3665 z
.high
= floatx80_default_nan_high
;
3668 float_raise( float_flag_divbyzero
);
3669 return packFloatx80( zSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
3671 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3674 if ( aSig
== 0 ) return packFloatx80( zSign
, 0, 0 );
3675 normalizeFloatx80Subnormal( aSig
, &aExp
, &aSig
);
3677 zExp
= aExp
- bExp
+ 0x3FFE;
3679 if ( bSig
<= aSig
) {
3680 shift128Right( aSig
, 0, 1, &aSig
, &rem1
);
3683 zSig0
= estimateDiv128To64( aSig
, rem1
, bSig
);
3684 mul64To128( bSig
, zSig0
, &term0
, &term1
);
3685 sub128( aSig
, rem1
, term0
, term1
, &rem0
, &rem1
);
3686 while ( (sbits64
) rem0
< 0 ) {
3688 add128( rem0
, rem1
, 0, bSig
, &rem0
, &rem1
);
3690 zSig1
= estimateDiv128To64( rem1
, 0, bSig
);
3691 if ( (bits64
) ( zSig1
<<1 ) <= 8 ) {
3692 mul64To128( bSig
, zSig1
, &term1
, &term2
);
3693 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3694 while ( (sbits64
) rem1
< 0 ) {
3696 add128( rem1
, rem2
, 0, bSig
, &rem1
, &rem2
);
3698 zSig1
|= ( ( rem1
| rem2
) != 0 );
3701 roundAndPackFloatx80(
3702 floatx80_rounding_precision
, zSign
, zExp
, zSig0
, zSig1
);
3706 /*----------------------------------------------------------------------------
3707 | Returns the remainder of the extended double-precision floating-point value
3708 | `a' with respect to the corresponding value `b'. The operation is performed
3709 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3710 *----------------------------------------------------------------------------*/
3712 floatx80
floatx80_rem( floatx80 a
, floatx80 b
)
3714 flag aSign
, bSign
, zSign
;
3715 int32 aExp
, bExp
, expDiff
;
3716 bits64 aSig0
, aSig1
, bSig
;
3717 bits64 q
, term0
, term1
, alternateASig0
, alternateASig1
;
3720 aSig0
= extractFloatx80Frac( a
);
3721 aExp
= extractFloatx80Exp( a
);
3722 aSign
= extractFloatx80Sign( a
);
3723 bSig
= extractFloatx80Frac( b
);
3724 bExp
= extractFloatx80Exp( b
);
3725 bSign
= extractFloatx80Sign( b
);
3726 if ( aExp
== 0x7FFF ) {
3727 if ( (bits64
) ( aSig0
<<1 )
3728 || ( ( bExp
== 0x7FFF ) && (bits64
) ( bSig
<<1 ) ) ) {
3729 return propagateFloatx80NaN( a
, b
);
3733 if ( bExp
== 0x7FFF ) {
3734 if ( (bits64
) ( bSig
<<1 ) ) return propagateFloatx80NaN( a
, b
);
3740 float_raise( float_flag_invalid
);
3741 z
.low
= floatx80_default_nan_low
;
3742 z
.high
= floatx80_default_nan_high
;
3745 normalizeFloatx80Subnormal( bSig
, &bExp
, &bSig
);
3748 if ( (bits64
) ( aSig0
<<1 ) == 0 ) return a
;
3749 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3751 bSig
|= LIT64( 0x8000000000000000 );
3753 expDiff
= aExp
- bExp
;
3755 if ( expDiff
< 0 ) {
3756 if ( expDiff
< -1 ) return a
;
3757 shift128Right( aSig0
, 0, 1, &aSig0
, &aSig1
);
3760 q
= ( bSig
<= aSig0
);
3761 if ( q
) aSig0
-= bSig
;
3763 while ( 0 < expDiff
) {
3764 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3765 q
= ( 2 < q
) ? q
- 2 : 0;
3766 mul64To128( bSig
, q
, &term0
, &term1
);
3767 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3768 shortShift128Left( aSig0
, aSig1
, 62, &aSig0
, &aSig1
);
3772 if ( 0 < expDiff
) {
3773 q
= estimateDiv128To64( aSig0
, aSig1
, bSig
);
3774 q
= ( 2 < q
) ? q
- 2 : 0;
3776 mul64To128( bSig
, q
<<( 64 - expDiff
), &term0
, &term1
);
3777 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3778 shortShift128Left( 0, bSig
, 64 - expDiff
, &term0
, &term1
);
3779 while ( le128( term0
, term1
, aSig0
, aSig1
) ) {
3781 sub128( aSig0
, aSig1
, term0
, term1
, &aSig0
, &aSig1
);
3788 sub128( term0
, term1
, aSig0
, aSig1
, &alternateASig0
, &alternateASig1
);
3789 if ( lt128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3790 || ( eq128( alternateASig0
, alternateASig1
, aSig0
, aSig1
)
3793 aSig0
= alternateASig0
;
3794 aSig1
= alternateASig1
;
3798 normalizeRoundAndPackFloatx80(
3799 80, zSign
, bExp
+ expDiff
, aSig0
, aSig1
);
3803 /*----------------------------------------------------------------------------
3804 | Returns the square root of the extended double-precision floating-point
3805 | value `a'. The operation is performed according to the IEC/IEEE Standard
3806 | for Binary Floating-Point Arithmetic.
3807 *----------------------------------------------------------------------------*/
3809 floatx80
floatx80_sqrt( floatx80 a
)
3813 bits64 aSig0
, aSig1
, zSig0
, zSig1
, doubleZSig0
;
3814 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
3817 aSig0
= extractFloatx80Frac( a
);
3818 aExp
= extractFloatx80Exp( a
);
3819 aSign
= extractFloatx80Sign( a
);
3820 if ( aExp
== 0x7FFF ) {
3821 if ( (bits64
) ( aSig0
<<1 ) ) return propagateFloatx80NaN( a
, a
);
3822 if ( ! aSign
) return a
;
3826 if ( ( aExp
| aSig0
) == 0 ) return a
;
3828 float_raise( float_flag_invalid
);
3829 z
.low
= floatx80_default_nan_low
;
3830 z
.high
= floatx80_default_nan_high
;
3834 if ( aSig0
== 0 ) return packFloatx80( 0, 0, 0 );
3835 normalizeFloatx80Subnormal( aSig0
, &aExp
, &aSig0
);
3837 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFF;
3838 zSig0
= estimateSqrt32( aExp
, aSig0
>>32 );
3839 shift128Right( aSig0
, 0, 2 + ( aExp
& 1 ), &aSig0
, &aSig1
);
3840 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
3841 doubleZSig0
= zSig0
<<1;
3842 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
3843 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
3844 while ( (sbits64
) rem0
< 0 ) {
3847 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
3849 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
3850 if ( ( zSig1
& LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3851 if ( zSig1
== 0 ) zSig1
= 1;
3852 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
3853 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
3854 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
3855 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3856 while ( (sbits64
) rem1
< 0 ) {
3858 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
3860 term2
|= doubleZSig0
;
3861 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
3863 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
3865 shortShift128Left( 0, zSig1
, 1, &zSig0
, &zSig1
);
3866 zSig0
|= doubleZSig0
;
3868 roundAndPackFloatx80(
3869 floatx80_rounding_precision
, 0, zExp
, zSig0
, zSig1
);
3873 /*----------------------------------------------------------------------------
3874 | Returns 1 if the extended double-precision floating-point value `a' is
3875 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3876 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3878 *----------------------------------------------------------------------------*/
3880 flag
floatx80_eq( floatx80 a
, floatx80 b
)
3883 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3884 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3885 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3886 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3888 if ( floatx80_is_signaling_nan( a
)
3889 || floatx80_is_signaling_nan( b
) ) {
3890 float_raise( float_flag_invalid
);
3896 && ( ( a
.high
== b
.high
)
3898 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3903 /*----------------------------------------------------------------------------
3904 | Returns 1 if the extended double-precision floating-point value `a' is
3905 | less than or equal to the corresponding value `b', and 0 otherwise. The
3906 | comparison is performed according to the IEC/IEEE Standard for Binary
3907 | Floating-Point Arithmetic.
3908 *----------------------------------------------------------------------------*/
3910 flag
floatx80_le( floatx80 a
, floatx80 b
)
3914 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3915 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3916 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3917 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3919 float_raise( float_flag_invalid
);
3922 aSign
= extractFloatx80Sign( a
);
3923 bSign
= extractFloatx80Sign( b
);
3924 if ( aSign
!= bSign
) {
3927 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3931 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
3932 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
3936 /*----------------------------------------------------------------------------
3937 | Returns 1 if the extended double-precision floating-point value `a' is
3938 | less than the corresponding value `b', and 0 otherwise. The comparison
3939 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3941 *----------------------------------------------------------------------------*/
3943 flag
floatx80_lt( floatx80 a
, floatx80 b
)
3947 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3948 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3949 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3950 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3952 float_raise( float_flag_invalid
);
3955 aSign
= extractFloatx80Sign( a
);
3956 bSign
= extractFloatx80Sign( b
);
3957 if ( aSign
!= bSign
) {
3960 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
3964 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
3965 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
3969 /*----------------------------------------------------------------------------
3970 | Returns 1 if the extended double-precision floating-point value `a' is equal
3971 | to the corresponding value `b', and 0 otherwise. The invalid exception is
3972 | raised if either operand is a NaN. Otherwise, the comparison is performed
3973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3974 *----------------------------------------------------------------------------*/
3976 flag
floatx80_eq_signaling( floatx80 a
, floatx80 b
)
3979 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
3980 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
3981 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
3982 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
3984 float_raise( float_flag_invalid
);
3989 && ( ( a
.high
== b
.high
)
3991 && ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
3996 /*----------------------------------------------------------------------------
3997 | Returns 1 if the extended double-precision floating-point value `a' is less
3998 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3999 | do not cause an exception. Otherwise, the comparison is performed according
4000 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4001 *----------------------------------------------------------------------------*/
4003 flag
floatx80_le_quiet( floatx80 a
, floatx80 b
)
4007 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4008 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4009 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4010 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4012 if ( floatx80_is_signaling_nan( a
)
4013 || floatx80_is_signaling_nan( b
) ) {
4014 float_raise( float_flag_invalid
);
4018 aSign
= extractFloatx80Sign( a
);
4019 bSign
= extractFloatx80Sign( b
);
4020 if ( aSign
!= bSign
) {
4023 || ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4027 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
4028 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
4032 /*----------------------------------------------------------------------------
4033 | Returns 1 if the extended double-precision floating-point value `a' is less
4034 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4035 | an exception. Otherwise, the comparison is performed according to the
4036 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4037 *----------------------------------------------------------------------------*/
4039 flag
floatx80_lt_quiet( floatx80 a
, floatx80 b
)
4043 if ( ( ( extractFloatx80Exp( a
) == 0x7FFF )
4044 && (bits64
) ( extractFloatx80Frac( a
)<<1 ) )
4045 || ( ( extractFloatx80Exp( b
) == 0x7FFF )
4046 && (bits64
) ( extractFloatx80Frac( b
)<<1 ) )
4048 if ( floatx80_is_signaling_nan( a
)
4049 || floatx80_is_signaling_nan( b
) ) {
4050 float_raise( float_flag_invalid
);
4054 aSign
= extractFloatx80Sign( a
);
4055 bSign
= extractFloatx80Sign( b
);
4056 if ( aSign
!= bSign
) {
4059 && ( ( ( (bits16
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
4063 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
4064 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
4072 /*----------------------------------------------------------------------------
4073 | Returns the result of converting the quadruple-precision floating-point
4074 | value `a' to the 32-bit two's complement integer format. The conversion
4075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4076 | Arithmetic---which means in particular that the conversion is rounded
4077 | according to the current rounding mode. If `a' is a NaN, the largest
4078 | positive integer is returned. Otherwise, if the conversion overflows, the
4079 | largest integer with the same sign as `a' is returned.
4080 *----------------------------------------------------------------------------*/
4082 int32
float128_to_int32( float128 a
)
4085 int32 aExp
, shiftCount
;
4086 bits64 aSig0
, aSig1
;
4088 aSig1
= extractFloat128Frac1( a
);
4089 aSig0
= extractFloat128Frac0( a
);
4090 aExp
= extractFloat128Exp( a
);
4091 aSign
= extractFloat128Sign( a
);
4092 if ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) aSign
= 0;
4093 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4094 aSig0
|= ( aSig1
!= 0 );
4095 shiftCount
= 0x4028 - aExp
;
4096 if ( 0 < shiftCount
) shift64RightJamming( aSig0
, shiftCount
, &aSig0
);
4097 return roundAndPackInt32( aSign
, aSig0
);
4101 /*----------------------------------------------------------------------------
4102 | Returns the result of converting the quadruple-precision floating-point
4103 | value `a' to the 32-bit two's complement integer format. The conversion
4104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4105 | Arithmetic, except that the conversion is always rounded toward zero. If
4106 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4107 | conversion overflows, the largest integer with the same sign as `a' is
4109 *----------------------------------------------------------------------------*/
4111 int32
float128_to_int32_round_to_zero( float128 a
)
4114 int32 aExp
, shiftCount
;
4115 bits64 aSig0
, aSig1
, savedASig
;
4118 aSig1
= extractFloat128Frac1( a
);
4119 aSig0
= extractFloat128Frac0( a
);
4120 aExp
= extractFloat128Exp( a
);
4121 aSign
= extractFloat128Sign( a
);
4122 aSig0
|= ( aSig1
!= 0 );
4123 if ( 0x401E < aExp
) {
4124 if ( ( aExp
== 0x7FFF ) && aSig0
) aSign
= 0;
4127 else if ( aExp
< 0x3FFF ) {
4128 if ( aExp
|| aSig0
) float_exception_flags
|= float_flag_inexact
;
4131 aSig0
|= LIT64( 0x0001000000000000 );
4132 shiftCount
= 0x402F - aExp
;
4134 aSig0
>>= shiftCount
;
4136 if ( aSign
) z
= - z
;
4137 if ( ( z
< 0 ) ^ aSign
) {
4139 float_raise( float_flag_invalid
);
4140 return aSign
? (sbits32
) 0x80000000 : 0x7FFFFFFF;
4142 if ( ( aSig0
<<shiftCount
) != savedASig
) {
4143 float_exception_flags
|= float_flag_inexact
;
4149 /*----------------------------------------------------------------------------
4150 | Returns the result of converting the quadruple-precision floating-point
4151 | value `a' to the 64-bit two's complement integer format. The conversion
4152 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4153 | Arithmetic---which means in particular that the conversion is rounded
4154 | according to the current rounding mode. If `a' is a NaN, the largest
4155 | positive integer is returned. Otherwise, if the conversion overflows, the
4156 | largest integer with the same sign as `a' is returned.
4157 *----------------------------------------------------------------------------*/
4159 int64
float128_to_int64( float128 a
)
4162 int32 aExp
, shiftCount
;
4163 bits64 aSig0
, aSig1
;
4165 aSig1
= extractFloat128Frac1( a
);
4166 aSig0
= extractFloat128Frac0( a
);
4167 aExp
= extractFloat128Exp( a
);
4168 aSign
= extractFloat128Sign( a
);
4169 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4170 shiftCount
= 0x402F - aExp
;
4171 if ( shiftCount
<= 0 ) {
4172 if ( 0x403E < aExp
) {
4173 float_raise( float_flag_invalid
);
4175 || ( ( aExp
== 0x7FFF )
4176 && ( aSig1
|| ( aSig0
!= LIT64( 0x0001000000000000 ) ) )
4179 return LIT64( 0x7FFFFFFFFFFFFFFF );
4181 return (sbits64
) LIT64( 0x8000000000000000 );
4183 shortShift128Left( aSig0
, aSig1
, - shiftCount
, &aSig0
, &aSig1
);
4186 shift64ExtraRightJamming( aSig0
, aSig1
, shiftCount
, &aSig0
, &aSig1
);
4188 return roundAndPackInt64( aSign
, aSig0
, aSig1
);
4192 /*----------------------------------------------------------------------------
4193 | Returns the result of converting the quadruple-precision floating-point
4194 | value `a' to the 64-bit two's complement integer format. The conversion
4195 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4196 | Arithmetic, except that the conversion is always rounded toward zero.
4197 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4198 | the conversion overflows, the largest integer with the same sign as `a' is
4200 *----------------------------------------------------------------------------*/
4202 int64
float128_to_int64_round_to_zero( float128 a
)
4205 int32 aExp
, shiftCount
;
4206 bits64 aSig0
, aSig1
;
4209 aSig1
= extractFloat128Frac1( a
);
4210 aSig0
= extractFloat128Frac0( a
);
4211 aExp
= extractFloat128Exp( a
);
4212 aSign
= extractFloat128Sign( a
);
4213 if ( aExp
) aSig0
|= LIT64( 0x0001000000000000 );
4214 shiftCount
= aExp
- 0x402F;
4215 if ( 0 < shiftCount
) {
4216 if ( 0x403E <= aExp
) {
4217 aSig0
&= LIT64( 0x0000FFFFFFFFFFFF );
4218 if ( ( a
.high
== LIT64( 0xC03E000000000000 ) )
4219 && ( aSig1
< LIT64( 0x0002000000000000 ) ) ) {
4220 if ( aSig1
) float_exception_flags
|= float_flag_inexact
;
4223 float_raise( float_flag_invalid
);
4224 if ( ! aSign
|| ( ( aExp
== 0x7FFF ) && ( aSig0
| aSig1
) ) ) {
4225 return LIT64( 0x7FFFFFFFFFFFFFFF );
4228 return (sbits64
) LIT64( 0x8000000000000000 );
4230 z
= ( aSig0
<<shiftCount
) | ( aSig1
>>( ( - shiftCount
) & 63 ) );
4231 if ( (bits64
) ( aSig1
<<shiftCount
) ) {
4232 float_exception_flags
|= float_flag_inexact
;
4236 if ( aExp
< 0x3FFF ) {
4237 if ( aExp
| aSig0
| aSig1
) {
4238 float_exception_flags
|= float_flag_inexact
;
4242 z
= aSig0
>>( - shiftCount
);
4244 || ( shiftCount
&& (bits64
) ( aSig0
<<( shiftCount
& 63 ) ) ) ) {
4245 float_exception_flags
|= float_flag_inexact
;
4248 if ( aSign
) z
= - z
;
4253 /*----------------------------------------------------------------------------
4254 | Returns the result of converting the quadruple-precision floating-point
4255 | value `a' to the single-precision floating-point format. The conversion
4256 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4258 *----------------------------------------------------------------------------*/
4260 float32
float128_to_float32( float128 a
)
4264 bits64 aSig0
, aSig1
;
4267 aSig1
= extractFloat128Frac1( a
);
4268 aSig0
= extractFloat128Frac0( a
);
4269 aExp
= extractFloat128Exp( a
);
4270 aSign
= extractFloat128Sign( a
);
4271 if ( aExp
== 0x7FFF ) {
4272 if ( aSig0
| aSig1
) {
4273 return commonNaNToFloat32( float128ToCommonNaN( a
) );
4275 return packFloat32( aSign
, 0xFF, 0 );
4277 aSig0
|= ( aSig1
!= 0 );
4278 shift64RightJamming( aSig0
, 18, &aSig0
);
4280 if ( aExp
|| zSig
) {
4284 return roundAndPackFloat32( aSign
, aExp
, zSig
);
4288 /*----------------------------------------------------------------------------
4289 | Returns the result of converting the quadruple-precision floating-point
4290 | value `a' to the double-precision floating-point format. The conversion
4291 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4293 *----------------------------------------------------------------------------*/
4295 float64
float128_to_float64( float128 a
)
4299 bits64 aSig0
, aSig1
;
4301 aSig1
= extractFloat128Frac1( a
);
4302 aSig0
= extractFloat128Frac0( a
);
4303 aExp
= extractFloat128Exp( a
);
4304 aSign
= extractFloat128Sign( a
);
4305 if ( aExp
== 0x7FFF ) {
4306 if ( aSig0
| aSig1
) {
4307 return commonNaNToFloat64( float128ToCommonNaN( a
) );
4309 return packFloat64( aSign
, 0x7FF, 0 );
4311 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4312 aSig0
|= ( aSig1
!= 0 );
4313 if ( aExp
|| aSig0
) {
4314 aSig0
|= LIT64( 0x4000000000000000 );
4317 return roundAndPackFloat64( aSign
, aExp
, aSig0
);
4323 /*----------------------------------------------------------------------------
4324 | Returns the result of converting the quadruple-precision floating-point
4325 | value `a' to the extended double-precision floating-point format. The
4326 | conversion is performed according to the IEC/IEEE Standard for Binary
4327 | Floating-Point Arithmetic.
4328 *----------------------------------------------------------------------------*/
4330 floatx80
float128_to_floatx80( float128 a
)
4334 bits64 aSig0
, aSig1
;
4336 aSig1
= extractFloat128Frac1( a
);
4337 aSig0
= extractFloat128Frac0( a
);
4338 aExp
= extractFloat128Exp( a
);
4339 aSign
= extractFloat128Sign( a
);
4340 if ( aExp
== 0x7FFF ) {
4341 if ( aSig0
| aSig1
) {
4342 return commonNaNToFloatx80( float128ToCommonNaN( a
) );
4344 return packFloatx80( aSign
, 0x7FFF, LIT64( 0x8000000000000000 ) );
4347 if ( ( aSig0
| aSig1
) == 0 ) return packFloatx80( aSign
, 0, 0 );
4348 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4351 aSig0
|= LIT64( 0x0001000000000000 );
4353 shortShift128Left( aSig0
, aSig1
, 15, &aSig0
, &aSig1
);
4354 return roundAndPackFloatx80( 80, aSign
, aExp
, aSig0
, aSig1
);
4360 /*----------------------------------------------------------------------------
4361 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4362 | returns the result as a quadruple-precision floating-point value. The
4363 | operation is performed according to the IEC/IEEE Standard for Binary
4364 | Floating-Point Arithmetic.
4365 *----------------------------------------------------------------------------*/
4367 float128
float128_round_to_int( float128 a
)
4371 bits64 lastBitMask
, roundBitsMask
;
4375 aExp
= extractFloat128Exp( a
);
4376 if ( 0x402F <= aExp
) {
4377 if ( 0x406F <= aExp
) {
4378 if ( ( aExp
== 0x7FFF )
4379 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) )
4381 return propagateFloat128NaN( a
, a
);
4386 lastBitMask
= ( lastBitMask
<<( 0x406E - aExp
) )<<1;
4387 roundBitsMask
= lastBitMask
- 1;
4389 roundingMode
= float_rounding_mode
;
4390 if ( roundingMode
== float_round_nearest_even
) {
4391 if ( lastBitMask
) {
4392 add128( z
.high
, z
.low
, 0, lastBitMask
>>1, &z
.high
, &z
.low
);
4393 if ( ( z
.low
& roundBitsMask
) == 0 ) z
.low
&= ~ lastBitMask
;
4396 if ( (sbits64
) z
.low
< 0 ) {
4398 if ( (bits64
) ( z
.low
<<1 ) == 0 ) z
.high
&= ~1;
4402 else if ( roundingMode
!= float_round_to_zero
) {
4403 if ( extractFloat128Sign( z
)
4404 ^ ( roundingMode
== float_round_up
) ) {
4405 add128( z
.high
, z
.low
, 0, roundBitsMask
, &z
.high
, &z
.low
);
4408 z
.low
&= ~ roundBitsMask
;
4411 if ( aExp
< 0x3FFF ) {
4412 if ( ( ( (bits64
) ( a
.high
<<1 ) ) | a
.low
) == 0 ) return a
;
4413 float_exception_flags
|= float_flag_inexact
;
4414 aSign
= extractFloat128Sign( a
);
4415 switch ( float_rounding_mode
) {
4416 case float_round_nearest_even
:
4417 if ( ( aExp
== 0x3FFE )
4418 && ( extractFloat128Frac0( a
)
4419 | extractFloat128Frac1( a
) )
4421 return packFloat128( aSign
, 0x3FFF, 0, 0 );
4424 case float_round_down
:
4426 aSign
? packFloat128( 1, 0x3FFF, 0, 0 )
4427 : packFloat128( 0, 0, 0, 0 );
4428 case float_round_up
:
4430 aSign
? packFloat128( 1, 0, 0, 0 )
4431 : packFloat128( 0, 0x3FFF, 0, 0 );
4433 return packFloat128( aSign
, 0, 0, 0 );
4436 lastBitMask
<<= 0x402F - aExp
;
4437 roundBitsMask
= lastBitMask
- 1;
4440 roundingMode
= float_rounding_mode
;
4441 if ( roundingMode
== float_round_nearest_even
) {
4442 z
.high
+= lastBitMask
>>1;
4443 if ( ( ( z
.high
& roundBitsMask
) | a
.low
) == 0 ) {
4444 z
.high
&= ~ lastBitMask
;
4447 else if ( roundingMode
!= float_round_to_zero
) {
4448 if ( extractFloat128Sign( z
)
4449 ^ ( roundingMode
== float_round_up
) ) {
4450 z
.high
|= ( a
.low
!= 0 );
4451 z
.high
+= roundBitsMask
;
4454 z
.high
&= ~ roundBitsMask
;
4456 if ( ( z
.low
!= a
.low
) || ( z
.high
!= a
.high
) ) {
4457 float_exception_flags
|= float_flag_inexact
;
4463 /*----------------------------------------------------------------------------
4464 | Returns the result of adding the absolute values of the quadruple-precision
4465 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4466 | before being returned. `zSign' is ignored if the result is a NaN.
4467 | The addition is performed according to the IEC/IEEE Standard for Binary
4468 | Floating-Point Arithmetic.
4469 *----------------------------------------------------------------------------*/
4471 static float128
addFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4473 int32 aExp
, bExp
, zExp
;
4474 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4477 aSig1
= extractFloat128Frac1( a
);
4478 aSig0
= extractFloat128Frac0( a
);
4479 aExp
= extractFloat128Exp( a
);
4480 bSig1
= extractFloat128Frac1( b
);
4481 bSig0
= extractFloat128Frac0( b
);
4482 bExp
= extractFloat128Exp( b
);
4483 expDiff
= aExp
- bExp
;
4484 if ( 0 < expDiff
) {
4485 if ( aExp
== 0x7FFF ) {
4486 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4493 bSig0
|= LIT64( 0x0001000000000000 );
4495 shift128ExtraRightJamming(
4496 bSig0
, bSig1
, 0, expDiff
, &bSig0
, &bSig1
, &zSig2
);
4499 else if ( expDiff
< 0 ) {
4500 if ( bExp
== 0x7FFF ) {
4501 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4502 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4508 aSig0
|= LIT64( 0x0001000000000000 );
4510 shift128ExtraRightJamming(
4511 aSig0
, aSig1
, 0, - expDiff
, &aSig0
, &aSig1
, &zSig2
);
4515 if ( aExp
== 0x7FFF ) {
4516 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4517 return propagateFloat128NaN( a
, b
);
4521 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4522 if ( aExp
== 0 ) return packFloat128( zSign
, 0, zSig0
, zSig1
);
4524 zSig0
|= LIT64( 0x0002000000000000 );
4528 aSig0
|= LIT64( 0x0001000000000000 );
4529 add128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4531 if ( zSig0
< LIT64( 0x0002000000000000 ) ) goto roundAndPack
;
4534 shift128ExtraRightJamming(
4535 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4537 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4541 /*----------------------------------------------------------------------------
4542 | Returns the result of subtracting the absolute values of the quadruple-
4543 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4544 | difference is negated before being returned. `zSign' is ignored if the
4545 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4546 | Standard for Binary Floating-Point Arithmetic.
4547 *----------------------------------------------------------------------------*/
4549 static float128
subFloat128Sigs( float128 a
, float128 b
, flag zSign
)
4551 int32 aExp
, bExp
, zExp
;
4552 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
;
4556 aSig1
= extractFloat128Frac1( a
);
4557 aSig0
= extractFloat128Frac0( a
);
4558 aExp
= extractFloat128Exp( a
);
4559 bSig1
= extractFloat128Frac1( b
);
4560 bSig0
= extractFloat128Frac0( b
);
4561 bExp
= extractFloat128Exp( b
);
4562 expDiff
= aExp
- bExp
;
4563 shortShift128Left( aSig0
, aSig1
, 14, &aSig0
, &aSig1
);
4564 shortShift128Left( bSig0
, bSig1
, 14, &bSig0
, &bSig1
);
4565 if ( 0 < expDiff
) goto aExpBigger
;
4566 if ( expDiff
< 0 ) goto bExpBigger
;
4567 if ( aExp
== 0x7FFF ) {
4568 if ( aSig0
| aSig1
| bSig0
| bSig1
) {
4569 return propagateFloat128NaN( a
, b
);
4571 float_raise( float_flag_invalid
);
4572 z
.low
= float128_default_nan_low
;
4573 z
.high
= float128_default_nan_high
;
4580 if ( bSig0
< aSig0
) goto aBigger
;
4581 if ( aSig0
< bSig0
) goto bBigger
;
4582 if ( bSig1
< aSig1
) goto aBigger
;
4583 if ( aSig1
< bSig1
) goto bBigger
;
4584 return packFloat128( float_rounding_mode
== float_round_down
, 0, 0, 0 );
4586 if ( bExp
== 0x7FFF ) {
4587 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4588 return packFloat128( zSign
^ 1, 0x7FFF, 0, 0 );
4594 aSig0
|= LIT64( 0x4000000000000000 );
4596 shift128RightJamming( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4597 bSig0
|= LIT64( 0x4000000000000000 );
4599 sub128( bSig0
, bSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4602 goto normalizeRoundAndPack
;
4604 if ( aExp
== 0x7FFF ) {
4605 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4612 bSig0
|= LIT64( 0x4000000000000000 );
4614 shift128RightJamming( bSig0
, bSig1
, expDiff
, &bSig0
, &bSig1
);
4615 aSig0
|= LIT64( 0x4000000000000000 );
4617 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
);
4619 normalizeRoundAndPack
:
4621 return normalizeRoundAndPackFloat128( zSign
, zExp
- 14, zSig0
, zSig1
);
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of adding the quadruple-precision floating-point values
4627 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4628 | for Binary Floating-Point Arithmetic.
4629 *----------------------------------------------------------------------------*/
4631 float128
float128_add( float128 a
, float128 b
)
4635 aSign
= extractFloat128Sign( a
);
4636 bSign
= extractFloat128Sign( b
);
4637 if ( aSign
== bSign
) {
4638 return addFloat128Sigs( a
, b
, aSign
);
4641 return subFloat128Sigs( a
, b
, aSign
);
4646 /*----------------------------------------------------------------------------
4647 | Returns the result of subtracting the quadruple-precision floating-point
4648 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4649 | Standard for Binary Floating-Point Arithmetic.
4650 *----------------------------------------------------------------------------*/
4652 float128
float128_sub( float128 a
, float128 b
)
4656 aSign
= extractFloat128Sign( a
);
4657 bSign
= extractFloat128Sign( b
);
4658 if ( aSign
== bSign
) {
4659 return subFloat128Sigs( a
, b
, aSign
);
4662 return addFloat128Sigs( a
, b
, aSign
);
4667 /*----------------------------------------------------------------------------
4668 | Returns the result of multiplying the quadruple-precision floating-point
4669 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4670 | Standard for Binary Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4673 float128
float128_mul( float128 a
, float128 b
)
4675 flag aSign
, bSign
, zSign
;
4676 int32 aExp
, bExp
, zExp
;
4677 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
, zSig3
;
4680 aSig1
= extractFloat128Frac1( a
);
4681 aSig0
= extractFloat128Frac0( a
);
4682 aExp
= extractFloat128Exp( a
);
4683 aSign
= extractFloat128Sign( a
);
4684 bSig1
= extractFloat128Frac1( b
);
4685 bSig0
= extractFloat128Frac0( b
);
4686 bExp
= extractFloat128Exp( b
);
4687 bSign
= extractFloat128Sign( b
);
4688 zSign
= aSign
^ bSign
;
4689 if ( aExp
== 0x7FFF ) {
4690 if ( ( aSig0
| aSig1
)
4691 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4692 return propagateFloat128NaN( a
, b
);
4694 if ( ( bExp
| bSig0
| bSig1
) == 0 ) goto invalid
;
4695 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4697 if ( bExp
== 0x7FFF ) {
4698 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4699 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4701 float_raise( float_flag_invalid
);
4702 z
.low
= float128_default_nan_low
;
4703 z
.high
= float128_default_nan_high
;
4706 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4709 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4710 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4713 if ( ( bSig0
| bSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4714 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4716 zExp
= aExp
+ bExp
- 0x4000;
4717 aSig0
|= LIT64( 0x0001000000000000 );
4718 shortShift128Left( bSig0
, bSig1
, 16, &bSig0
, &bSig1
);
4719 mul128To256( aSig0
, aSig1
, bSig0
, bSig1
, &zSig0
, &zSig1
, &zSig2
, &zSig3
);
4720 add128( zSig0
, zSig1
, aSig0
, aSig1
, &zSig0
, &zSig1
);
4721 zSig2
|= ( zSig3
!= 0 );
4722 if ( LIT64( 0x0002000000000000 ) <= zSig0
) {
4723 shift128ExtraRightJamming(
4724 zSig0
, zSig1
, zSig2
, 1, &zSig0
, &zSig1
, &zSig2
);
4727 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4731 /*----------------------------------------------------------------------------
4732 | Returns the result of dividing the quadruple-precision floating-point value
4733 | `a' by the corresponding value `b'. The operation is performed according to
4734 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4735 *----------------------------------------------------------------------------*/
4737 float128
float128_div( float128 a
, float128 b
)
4739 flag aSign
, bSign
, zSign
;
4740 int32 aExp
, bExp
, zExp
;
4741 bits64 aSig0
, aSig1
, bSig0
, bSig1
, zSig0
, zSig1
, zSig2
;
4742 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4745 aSig1
= extractFloat128Frac1( a
);
4746 aSig0
= extractFloat128Frac0( a
);
4747 aExp
= extractFloat128Exp( a
);
4748 aSign
= extractFloat128Sign( a
);
4749 bSig1
= extractFloat128Frac1( b
);
4750 bSig0
= extractFloat128Frac0( b
);
4751 bExp
= extractFloat128Exp( b
);
4752 bSign
= extractFloat128Sign( b
);
4753 zSign
= aSign
^ bSign
;
4754 if ( aExp
== 0x7FFF ) {
4755 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, b
);
4756 if ( bExp
== 0x7FFF ) {
4757 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4760 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4762 if ( bExp
== 0x7FFF ) {
4763 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4764 return packFloat128( zSign
, 0, 0, 0 );
4767 if ( ( bSig0
| bSig1
) == 0 ) {
4768 if ( ( aExp
| aSig0
| aSig1
) == 0 ) {
4770 float_raise( float_flag_invalid
);
4771 z
.low
= float128_default_nan_low
;
4772 z
.high
= float128_default_nan_high
;
4775 float_raise( float_flag_divbyzero
);
4776 return packFloat128( zSign
, 0x7FFF, 0, 0 );
4778 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4781 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( zSign
, 0, 0, 0 );
4782 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4784 zExp
= aExp
- bExp
+ 0x3FFD;
4786 aSig0
| LIT64( 0x0001000000000000 ), aSig1
, 15, &aSig0
, &aSig1
);
4788 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4789 if ( le128( bSig0
, bSig1
, aSig0
, aSig1
) ) {
4790 shift128Right( aSig0
, aSig1
, 1, &aSig0
, &aSig1
);
4793 zSig0
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4794 mul128By64To192( bSig0
, bSig1
, zSig0
, &term0
, &term1
, &term2
);
4795 sub192( aSig0
, aSig1
, 0, term0
, term1
, term2
, &rem0
, &rem1
, &rem2
);
4796 while ( (sbits64
) rem0
< 0 ) {
4798 add192( rem0
, rem1
, rem2
, 0, bSig0
, bSig1
, &rem0
, &rem1
, &rem2
);
4800 zSig1
= estimateDiv128To64( rem1
, rem2
, bSig0
);
4801 if ( ( zSig1
& 0x3FFF ) <= 4 ) {
4802 mul128By64To192( bSig0
, bSig1
, zSig1
, &term1
, &term2
, &term3
);
4803 sub192( rem1
, rem2
, 0, term1
, term2
, term3
, &rem1
, &rem2
, &rem3
);
4804 while ( (sbits64
) rem1
< 0 ) {
4806 add192( rem1
, rem2
, rem3
, 0, bSig0
, bSig1
, &rem1
, &rem2
, &rem3
);
4808 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4810 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 15, &zSig0
, &zSig1
, &zSig2
);
4811 return roundAndPackFloat128( zSign
, zExp
, zSig0
, zSig1
, zSig2
);
4815 /*----------------------------------------------------------------------------
4816 | Returns the remainder of the quadruple-precision floating-point value `a'
4817 | with respect to the corresponding value `b'. The operation is performed
4818 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4819 *----------------------------------------------------------------------------*/
4821 float128
float128_rem( float128 a
, float128 b
)
4823 flag aSign
, bSign
, zSign
;
4824 int32 aExp
, bExp
, expDiff
;
4825 bits64 aSig0
, aSig1
, bSig0
, bSig1
, q
, term0
, term1
, term2
;
4826 bits64 allZero
, alternateASig0
, alternateASig1
, sigMean1
;
4830 aSig1
= extractFloat128Frac1( a
);
4831 aSig0
= extractFloat128Frac0( a
);
4832 aExp
= extractFloat128Exp( a
);
4833 aSign
= extractFloat128Sign( a
);
4834 bSig1
= extractFloat128Frac1( b
);
4835 bSig0
= extractFloat128Frac0( b
);
4836 bExp
= extractFloat128Exp( b
);
4837 bSign
= extractFloat128Sign( b
);
4838 if ( aExp
== 0x7FFF ) {
4839 if ( ( aSig0
| aSig1
)
4840 || ( ( bExp
== 0x7FFF ) && ( bSig0
| bSig1
) ) ) {
4841 return propagateFloat128NaN( a
, b
);
4845 if ( bExp
== 0x7FFF ) {
4846 if ( bSig0
| bSig1
) return propagateFloat128NaN( a
, b
);
4850 if ( ( bSig0
| bSig1
) == 0 ) {
4852 float_raise( float_flag_invalid
);
4853 z
.low
= float128_default_nan_low
;
4854 z
.high
= float128_default_nan_high
;
4857 normalizeFloat128Subnormal( bSig0
, bSig1
, &bExp
, &bSig0
, &bSig1
);
4860 if ( ( aSig0
| aSig1
) == 0 ) return a
;
4861 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4863 expDiff
= aExp
- bExp
;
4864 if ( expDiff
< -1 ) return a
;
4866 aSig0
| LIT64( 0x0001000000000000 ),
4868 15 - ( expDiff
< 0 ),
4873 bSig0
| LIT64( 0x0001000000000000 ), bSig1
, 15, &bSig0
, &bSig1
);
4874 q
= le128( bSig0
, bSig1
, aSig0
, aSig1
);
4875 if ( q
) sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4877 while ( 0 < expDiff
) {
4878 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4879 q
= ( 4 < q
) ? q
- 4 : 0;
4880 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4881 shortShift192Left( term0
, term1
, term2
, 61, &term1
, &term2
, &allZero
);
4882 shortShift128Left( aSig0
, aSig1
, 61, &aSig0
, &allZero
);
4883 sub128( aSig0
, 0, term1
, term2
, &aSig0
, &aSig1
);
4886 if ( -64 < expDiff
) {
4887 q
= estimateDiv128To64( aSig0
, aSig1
, bSig0
);
4888 q
= ( 4 < q
) ? q
- 4 : 0;
4890 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4892 if ( expDiff
< 0 ) {
4893 shift128Right( aSig0
, aSig1
, - expDiff
, &aSig0
, &aSig1
);
4896 shortShift128Left( aSig0
, aSig1
, expDiff
, &aSig0
, &aSig1
);
4898 mul128By64To192( bSig0
, bSig1
, q
, &term0
, &term1
, &term2
);
4899 sub128( aSig0
, aSig1
, term1
, term2
, &aSig0
, &aSig1
);
4902 shift128Right( aSig0
, aSig1
, 12, &aSig0
, &aSig1
);
4903 shift128Right( bSig0
, bSig1
, 12, &bSig0
, &bSig1
);
4906 alternateASig0
= aSig0
;
4907 alternateASig1
= aSig1
;
4909 sub128( aSig0
, aSig1
, bSig0
, bSig1
, &aSig0
, &aSig1
);
4910 } while ( 0 <= (sbits64
) aSig0
);
4912 aSig0
, aSig1
, alternateASig0
, alternateASig1
, &sigMean0
, &sigMean1
);
4913 if ( ( sigMean0
< 0 )
4914 || ( ( ( sigMean0
| sigMean1
) == 0 ) && ( q
& 1 ) ) ) {
4915 aSig0
= alternateASig0
;
4916 aSig1
= alternateASig1
;
4918 zSign
= ( (sbits64
) aSig0
< 0 );
4919 if ( zSign
) sub128( 0, 0, aSig0
, aSig1
, &aSig0
, &aSig1
);
4921 normalizeRoundAndPackFloat128( aSign
^ zSign
, bExp
- 4, aSig0
, aSig1
);
4925 /*----------------------------------------------------------------------------
4926 | Returns the square root of the quadruple-precision floating-point value `a'.
4927 | The operation is performed according to the IEC/IEEE Standard for Binary
4928 | Floating-Point Arithmetic.
4929 *----------------------------------------------------------------------------*/
4931 float128
float128_sqrt( float128 a
)
4935 bits64 aSig0
, aSig1
, zSig0
, zSig1
, zSig2
, doubleZSig0
;
4936 bits64 rem0
, rem1
, rem2
, rem3
, term0
, term1
, term2
, term3
;
4939 aSig1
= extractFloat128Frac1( a
);
4940 aSig0
= extractFloat128Frac0( a
);
4941 aExp
= extractFloat128Exp( a
);
4942 aSign
= extractFloat128Sign( a
);
4943 if ( aExp
== 0x7FFF ) {
4944 if ( aSig0
| aSig1
) return propagateFloat128NaN( a
, a
);
4945 if ( ! aSign
) return a
;
4949 if ( ( aExp
| aSig0
| aSig1
) == 0 ) return a
;
4951 float_raise( float_flag_invalid
);
4952 z
.low
= float128_default_nan_low
;
4953 z
.high
= float128_default_nan_high
;
4957 if ( ( aSig0
| aSig1
) == 0 ) return packFloat128( 0, 0, 0, 0 );
4958 normalizeFloat128Subnormal( aSig0
, aSig1
, &aExp
, &aSig0
, &aSig1
);
4960 zExp
= ( ( aExp
- 0x3FFF )>>1 ) + 0x3FFE;
4961 aSig0
|= LIT64( 0x0001000000000000 );
4962 zSig0
= estimateSqrt32( aExp
, aSig0
>>17 );
4963 shortShift128Left( aSig0
, aSig1
, 13 - ( aExp
& 1 ), &aSig0
, &aSig1
);
4964 zSig0
= estimateDiv128To64( aSig0
, aSig1
, zSig0
<<32 ) + ( zSig0
<<30 );
4965 doubleZSig0
= zSig0
<<1;
4966 mul64To128( zSig0
, zSig0
, &term0
, &term1
);
4967 sub128( aSig0
, aSig1
, term0
, term1
, &rem0
, &rem1
);
4968 while ( (sbits64
) rem0
< 0 ) {
4971 add128( rem0
, rem1
, zSig0
>>63, doubleZSig0
| 1, &rem0
, &rem1
);
4973 zSig1
= estimateDiv128To64( rem1
, 0, doubleZSig0
);
4974 if ( ( zSig1
& 0x1FFF ) <= 5 ) {
4975 if ( zSig1
== 0 ) zSig1
= 1;
4976 mul64To128( doubleZSig0
, zSig1
, &term1
, &term2
);
4977 sub128( rem1
, 0, term1
, term2
, &rem1
, &rem2
);
4978 mul64To128( zSig1
, zSig1
, &term2
, &term3
);
4979 sub192( rem1
, rem2
, 0, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4980 while ( (sbits64
) rem1
< 0 ) {
4982 shortShift128Left( 0, zSig1
, 1, &term2
, &term3
);
4984 term2
|= doubleZSig0
;
4985 add192( rem1
, rem2
, rem3
, 0, term2
, term3
, &rem1
, &rem2
, &rem3
);
4987 zSig1
|= ( ( rem1
| rem2
| rem3
) != 0 );
4989 shift128ExtraRightJamming( zSig0
, zSig1
, 0, 14, &zSig0
, &zSig1
, &zSig2
);
4990 return roundAndPackFloat128( 0, zExp
, zSig0
, zSig1
, zSig2
);
4994 /*----------------------------------------------------------------------------
4995 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4996 | the corresponding value `b', and 0 otherwise. The comparison is performed
4997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4998 *----------------------------------------------------------------------------*/
5000 flag
float128_eq( float128 a
, float128 b
)
5003 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5004 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5005 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5006 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5008 if ( float128_is_signaling_nan( a
)
5009 || float128_is_signaling_nan( b
) ) {
5010 float_raise( float_flag_invalid
);
5016 && ( ( a
.high
== b
.high
)
5018 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5023 /*----------------------------------------------------------------------------
5024 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5025 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5028 *----------------------------------------------------------------------------*/
5030 flag
float128_le( float128 a
, float128 b
)
5034 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5035 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5036 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5037 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5039 float_raise( float_flag_invalid
);
5042 aSign
= extractFloat128Sign( a
);
5043 bSign
= extractFloat128Sign( b
);
5044 if ( aSign
!= bSign
) {
5047 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5051 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5052 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5056 /*----------------------------------------------------------------------------
5057 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5058 | the corresponding value `b', and 0 otherwise. The comparison is performed
5059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5060 *----------------------------------------------------------------------------*/
5062 flag
float128_lt( float128 a
, float128 b
)
5066 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5067 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5068 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5069 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5071 float_raise( float_flag_invalid
);
5074 aSign
= extractFloat128Sign( a
);
5075 bSign
= extractFloat128Sign( b
);
5076 if ( aSign
!= bSign
) {
5079 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5083 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5084 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);
5088 /*----------------------------------------------------------------------------
5089 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5090 | the corresponding value `b', and 0 otherwise. The invalid exception is
5091 | raised if either operand is a NaN. Otherwise, the comparison is performed
5092 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5095 flag
float128_eq_signaling( float128 a
, float128 b
)
5098 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5099 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5100 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5101 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5103 float_raise( float_flag_invalid
);
5108 && ( ( a
.high
== b
.high
)
5110 && ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) == 0 ) )
5115 /*----------------------------------------------------------------------------
5116 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5117 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5118 | cause an exception. Otherwise, the comparison is performed according to the
5119 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5122 flag
float128_le_quiet( float128 a
, float128 b
)
5126 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5127 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5128 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5129 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5131 if ( float128_is_signaling_nan( a
)
5132 || float128_is_signaling_nan( b
) ) {
5133 float_raise( float_flag_invalid
);
5137 aSign
= extractFloat128Sign( a
);
5138 bSign
= extractFloat128Sign( b
);
5139 if ( aSign
!= bSign
) {
5142 || ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5146 aSign
? le128( b
.high
, b
.low
, a
.high
, a
.low
)
5147 : le128( a
.high
, a
.low
, b
.high
, b
.low
);
5151 /*----------------------------------------------------------------------------
5152 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5153 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5154 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5155 | Standard for Binary Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5158 flag
float128_lt_quiet( float128 a
, float128 b
)
5162 if ( ( ( extractFloat128Exp( a
) == 0x7FFF )
5163 && ( extractFloat128Frac0( a
) | extractFloat128Frac1( a
) ) )
5164 || ( ( extractFloat128Exp( b
) == 0x7FFF )
5165 && ( extractFloat128Frac0( b
) | extractFloat128Frac1( b
) ) )
5167 if ( float128_is_signaling_nan( a
)
5168 || float128_is_signaling_nan( b
) ) {
5169 float_raise( float_flag_invalid
);
5173 aSign
= extractFloat128Sign( a
);
5174 bSign
= extractFloat128Sign( b
);
5175 if ( aSign
!= bSign
) {
5178 && ( ( ( (bits64
) ( ( a
.high
| b
.high
)<<1 ) ) | a
.low
| b
.low
)
5182 aSign
? lt128( b
.high
, b
.low
, a
.high
, a
.low
)
5183 : lt128( a
.high
, a
.low
, b
.high
, b
.low
);