softfloat/SoftFloat-3/source/OLD-softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "milieu.h"
  34 #include "softfloat.h"
  35
  36 /*----------------------------------------------------------------------------
  37 | Primitive arithmetic functions, including multi-word arithmetic, and
  38 | division and square root approximations.  (Can be specialized to target if
  39 | desired.)
  40 *----------------------------------------------------------------------------*/
  41 #include "softfloat-macros"
  42
  43 /*----------------------------------------------------------------------------
  44 | Functions and definitions to determine:  (1) whether tininess for underflow
  45 | is detected before or after rounding by default, (2) what (if anything)
  46 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  47 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  48 | are propagated from function inputs to output.  These details are target-
  49 | specific.
  50 *----------------------------------------------------------------------------*/
  51 #include "softfloat-specialize"
  52
  53 #ifdef FLOATX80
  54
  55 /*----------------------------------------------------------------------------
  56 | Returns the fraction bits of the extended double-precision floating-point
  57 | value `a'.
  58 *----------------------------------------------------------------------------*/
  59
  60 INLINE bits64 extractFloatx80Frac( floatx80 a )
  61 {
  62
  63     return a.low;
  64
  65 }
  66
  67 /*----------------------------------------------------------------------------
  68 | Returns the exponent bits of the extended double-precision floating-point
  69 | value `a'.
  70 *----------------------------------------------------------------------------*/
  71
  72 INLINE int32 extractFloatx80Exp( floatx80 a )
  73 {
  74
  75     return a.high & 0x7FFF;
  76
  77 }
  78
  79 /*----------------------------------------------------------------------------
  80 | Returns the sign bit of the extended double-precision floating-point value
  81 | `a'.
  82 *----------------------------------------------------------------------------*/
  83
  84 INLINE flag extractFloatx80Sign( floatx80 a )
  85 {
  86
  87     return a.high>>15;
  88
  89 }
  90
  91 /*----------------------------------------------------------------------------
  92 | Normalizes the subnormal extended double-precision floating-point value
  93 | represented by the denormalized significand `aSig'.  The normalized exponent
  94 | and significand are stored at the locations pointed to by `zExpPtr' and
  95 | `zSigPtr', respectively.
  96 *----------------------------------------------------------------------------*/
  97
  98 static void
  99  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 100 {
 101     int8 shiftCount;
 102
 103     shiftCount = countLeadingZeros64( aSig );
 104     *zSigPtr = aSig<<shiftCount;
 105     *zExpPtr = 1 - shiftCount;
 106
 107 }
 108
 109 /*----------------------------------------------------------------------------
 110 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 111 | extended double-precision floating-point value, returning the result.
 112 *----------------------------------------------------------------------------*/
 113
 114 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 115 {
 116     floatx80 z;
 117
 118     z.low = zSig;
 119     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 120     return z;
 121
 122 }
 123
 124 /*----------------------------------------------------------------------------
 125 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 126 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 127 | and returns the proper extended double-precision floating-point value
 128 | corresponding to the abstract input.  Ordinarily, the abstract value is
 129 | rounded and packed into the extended double-precision format, with the
 130 | inexact exception raised if the abstract input cannot be represented
 131 | exactly.  However, if the abstract value is too large, the overflow and
 132 | inexact exceptions are raised and an infinity or maximal finite value is
 133 | returned.  If the abstract value is too small, the input value is rounded to
 134 | a subnormal number, and the underflow and inexact exceptions are raised if
 135 | the abstract input cannot be represented exactly as a subnormal extended
 136 | double-precision floating-point number.
 137 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 138 | number of bits as single or double precision, respectively.  Otherwise, the
 139 | result is rounded to the full precision of the extended double-precision
 140 | format.
 141 |     The input significand must be normalized or smaller.  If the input
 142 | significand is not normalized, `zExp' must be 0; in that case, the result
 143 | returned is a subnormal number, and it must not require rounding.  The
 144 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 145 | Floating-Point Arithmetic.
 146 *----------------------------------------------------------------------------*/
 147
 148 static floatx80
 149  roundAndPackFloatx80(
 150      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 151  )
 152 {
 153     int8 roundingMode;
 154     flag roundNearestEven, increment, isTiny;
 155     int64 roundIncrement, roundMask, roundBits;
 156
 157     roundingMode = float_rounding_mode;
 158     roundNearestEven = ( roundingMode == float_round_nearest_even );
 159     if ( roundingPrecision == 80 ) goto precision80;
 160     if ( roundingPrecision == 64 ) {
 161         roundIncrement = LIT64( 0x0000000000000400 );
 162         roundMask = LIT64( 0x00000000000007FF );
 163     }
 164     else if ( roundingPrecision == 32 ) {
 165         roundIncrement = LIT64( 0x0000008000000000 );
 166         roundMask = LIT64( 0x000000FFFFFFFFFF );
 167     }
 168     else {
 169         goto precision80;
 170     }
 171     zSig0 |= ( zSig1 != 0 );
 172     if ( ! roundNearestEven ) {
 173         if ( roundingMode == float_round_to_zero ) {
 174             roundIncrement = 0;
 175         }
 176         else {
 177             roundIncrement = roundMask;
 178             if ( zSign ) {
 179                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 180             }
 181             else {
 182                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 183             }
 184         }
 185     }
 186     roundBits = zSig0 & roundMask;
 187     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 188         if (    ( 0x7FFE < zExp )
 189              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 190            ) {
 191             goto overflow;
 192         }
 193         if ( zExp <= 0 ) {
 194             isTiny =
 195                    ( float_detect_tininess == float_tininess_before_rounding )
 196                 || ( zExp < 0 )
 197                 || ( zSig0 <= zSig0 + roundIncrement );
 198             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 199             zExp = 0;
 200             roundBits = zSig0 & roundMask;
 201             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 202             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 203             zSig0 += roundIncrement;
 204             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 205             roundIncrement = roundMask + 1;
 206             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 207                 roundMask |= roundIncrement;
 208             }
 209             zSig0 &= ~ roundMask;
 210             return packFloatx80( zSign, zExp, zSig0 );
 211         }
 212     }
 213     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 214     zSig0 += roundIncrement;
 215     if ( zSig0 < roundIncrement ) {
 216         ++zExp;
 217         zSig0 = LIT64( 0x8000000000000000 );
 218     }
 219     roundIncrement = roundMask + 1;
 220     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 221         roundMask |= roundIncrement;
 222     }
 223     zSig0 &= ~ roundMask;
 224     if ( zSig0 == 0 ) zExp = 0;
 225     return packFloatx80( zSign, zExp, zSig0 );
 226  precision80:
 227     increment = ( (sbits64) zSig1 < 0 );
 228     if ( ! roundNearestEven ) {
 229         if ( roundingMode == float_round_to_zero ) {
 230             increment = 0;
 231         }
 232         else {
 233             if ( zSign ) {
 234                 increment = ( roundingMode == float_round_down ) && zSig1;
 235             }
 236             else {
 237                 increment = ( roundingMode == float_round_up ) && zSig1;
 238             }
 239         }
 240     }
 241     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 242         if (    ( 0x7FFE < zExp )
 243              || (    ( zExp == 0x7FFE )
 244                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 245                   && increment
 246                 )
 247            ) {
 248             roundMask = 0;
 249  overflow:
 250             float_raise( float_flag_overflow | float_flag_inexact );
 251             if (    ( roundingMode == float_round_to_zero )
 252                  || ( zSign && ( roundingMode == float_round_up ) )
 253                  || ( ! zSign && ( roundingMode == float_round_down ) )
 254                ) {
 255                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 256             }
 257             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 258         }
 259         if ( zExp <= 0 ) {
 260             isTiny =
 261                    ( float_detect_tininess == float_tininess_before_rounding )
 262                 || ( zExp < 0 )
 263                 || ! increment
 264                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 265             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 266             zExp = 0;
 267             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 268             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 269             if ( roundNearestEven ) {
 270                 increment = ( (sbits64) zSig1 < 0 );
 271             }
 272             else {
 273                 if ( zSign ) {
 274                     increment = ( roundingMode == float_round_down ) && zSig1;
 275                 }
 276                 else {
 277                     increment = ( roundingMode == float_round_up ) && zSig1;
 278                 }
 279             }
 280             if ( increment ) {
 281                 ++zSig0;
 282                 zSig0 &=
 283                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 284                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 285             }
 286             return packFloatx80( zSign, zExp, zSig0 );
 287         }
 288     }
 289     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 290     if ( increment ) {
 291         ++zSig0;
 292         if ( zSig0 == 0 ) {
 293             ++zExp;
 294             zSig0 = LIT64( 0x8000000000000000 );
 295         }
 296         else {
 297             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 298         }
 299     }
 300     else {
 301         if ( zSig0 == 0 ) zExp = 0;
 302     }
 303     return packFloatx80( zSign, zExp, zSig0 );
 304
 305 }
 306
 307 /*----------------------------------------------------------------------------
 308 | Takes an abstract floating-point value having sign `zSign', exponent
 309 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 310 | and returns the proper extended double-precision floating-point value
 311 | corresponding to the abstract input.  This routine is just like
 312 | `roundAndPackFloatx80' except that the input significand does not have to be
 313 | normalized.
 314 *----------------------------------------------------------------------------*/
 315
 316 static floatx80
 317  normalizeRoundAndPackFloatx80(
 318      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 319  )
 320 {
 321     int8 shiftCount;
 322
 323     if ( zSig0 == 0 ) {
 324         zSig0 = zSig1;
 325         zSig1 = 0;
 326         zExp -= 64;
 327     }
 328     shiftCount = countLeadingZeros64( zSig0 );
 329     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 330     zExp -= shiftCount;
 331     return
 332         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 333
 334 }
 335
 336 #endif
 337
 338 #ifdef FLOAT128
 339
 340 /*----------------------------------------------------------------------------
 341 | Returns the least-significant 64 fraction bits of the quadruple-precision
 342 | floating-point value `a'.
 343 *----------------------------------------------------------------------------*/
 344
 345 INLINE bits64 extractFloat128Frac1( float128 a )
 346 {
 347
 348     return a.low;
 349
 350 }
 351
 352 /*----------------------------------------------------------------------------
 353 | Returns the most-significant 48 fraction bits of the quadruple-precision
 354 | floating-point value `a'.
 355 *----------------------------------------------------------------------------*/
 356
 357 INLINE bits64 extractFloat128Frac0( float128 a )
 358 {
 359
 360     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 361
 362 }
 363
 364 /*----------------------------------------------------------------------------
 365 | Returns the exponent bits of the quadruple-precision floating-point value
 366 | `a'.
 367 *----------------------------------------------------------------------------*/
 368
 369 INLINE int32 extractFloat128Exp( float128 a )
 370 {
 371
 372     return ( a.high>>48 ) & 0x7FFF;
 373
 374 }
 375
 376 /*----------------------------------------------------------------------------
 377 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 378 *----------------------------------------------------------------------------*/
 379
 380 INLINE flag extractFloat128Sign( float128 a )
 381 {
 382
 383     return a.high>>63;
 384
 385 }
 386
 387 /*----------------------------------------------------------------------------
 388 | Normalizes the subnormal quadruple-precision floating-point value
 389 | represented by the denormalized significand formed by the concatenation of
 390 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 391 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 392 | significand are stored at the location pointed to by `zSig0Ptr', and the
 393 | least significant 64 bits of the normalized significand are stored at the
 394 | location pointed to by `zSig1Ptr'.
 395 *----------------------------------------------------------------------------*/
 396
 397 static void
 398  normalizeFloat128Subnormal(
 399      bits64 aSig0,
 400      bits64 aSig1,
 401      int32 *zExpPtr,
 402      bits64 *zSig0Ptr,
 403      bits64 *zSig1Ptr
 404  )
 405 {
 406     int8 shiftCount;
 407
 408     if ( aSig0 == 0 ) {
 409         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 410         if ( shiftCount < 0 ) {
 411             *zSig0Ptr = aSig1>>( - shiftCount );
 412             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 413         }
 414         else {
 415             *zSig0Ptr = aSig1<<shiftCount;
 416             *zSig1Ptr = 0;
 417         }
 418         *zExpPtr = - shiftCount - 63;
 419     }
 420     else {
 421         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 422         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 423         *zExpPtr = 1 - shiftCount;
 424     }
 425
 426 }
 427
 428 /*----------------------------------------------------------------------------
 429 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 430 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 431 | floating-point value, returning the result.  After being shifted into the
 432 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 433 | added together to form the most significant 32 bits of the result.  This
 434 | means that any integer portion of `zSig0' will be added into the exponent.
 435 | Since a properly normalized significand will have an integer portion equal
 436 | to 1, the `zExp' input should be 1 less than the desired result exponent
 437 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 438 | significand.
 439 *----------------------------------------------------------------------------*/
 440
 441 INLINE float128
 442  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 443 {
 444     float128 z;
 445
 446     z.low = zSig1;
 447     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 448     return z;
 449
 450 }
 451
 452 /*----------------------------------------------------------------------------
 453 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 454 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 455 | and `zSig2', and returns the proper quadruple-precision floating-point value
 456 | corresponding to the abstract input.  Ordinarily, the abstract value is
 457 | simply rounded and packed into the quadruple-precision format, with the
 458 | inexact exception raised if the abstract input cannot be represented
 459 | exactly.  However, if the abstract value is too large, the overflow and
 460 | inexact exceptions are raised and an infinity or maximal finite value is
 461 | returned.  If the abstract value is too small, the input value is rounded to
 462 | a subnormal number, and the underflow and inexact exceptions are raised if
 463 | the abstract input cannot be represented exactly as a subnormal quadruple-
 464 | precision floating-point number.
 465 |     The input significand must be normalized or smaller.  If the input
 466 | significand is not normalized, `zExp' must be 0; in that case, the result
 467 | returned is a subnormal number, and it must not require rounding.  In the
 468 | usual case that the input significand is normalized, `zExp' must be 1 less
 469 | than the ``true'' floating-point exponent.  The handling of underflow and
 470 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 471 *----------------------------------------------------------------------------*/
 472
 473 static float128
 474  roundAndPackFloat128(
 475      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 476 {
 477     int8 roundingMode;
 478     flag roundNearestEven, increment, isTiny;
 479
 480     roundingMode = float_rounding_mode;
 481     roundNearestEven = ( roundingMode == float_round_nearest_even );
 482     increment = ( (sbits64) zSig2 < 0 );
 483     if ( ! roundNearestEven ) {
 484         if ( roundingMode == float_round_to_zero ) {
 485             increment = 0;
 486         }
 487         else {
 488             if ( zSign ) {
 489                 increment = ( roundingMode == float_round_down ) && zSig2;
 490             }
 491             else {
 492                 increment = ( roundingMode == float_round_up ) && zSig2;
 493             }
 494         }
 495     }
 496     if ( 0x7FFD <= (bits32) zExp ) {
 497         if (    ( 0x7FFD < zExp )
 498              || (    ( zExp == 0x7FFD )
 499                   && eq128(
 500                          LIT64( 0x0001FFFFFFFFFFFF ),
 501                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 502                          zSig0,
 503                          zSig1
 504                      )
 505                   && increment
 506                 )
 507            ) {
 508             float_raise( float_flag_overflow | float_flag_inexact );
 509             if (    ( roundingMode == float_round_to_zero )
 510                  || ( zSign && ( roundingMode == float_round_up ) )
 511                  || ( ! zSign && ( roundingMode == float_round_down ) )
 512                ) {
 513                 return
 514                     packFloat128(
 515                         zSign,
 516                         0x7FFE,
 517                         LIT64( 0x0000FFFFFFFFFFFF ),
 518                         LIT64( 0xFFFFFFFFFFFFFFFF )
 519                     );
 520             }
 521             return packFloat128( zSign, 0x7FFF, 0, 0 );
 522         }
 523         if ( zExp < 0 ) {
 524             isTiny =
 525                    ( float_detect_tininess == float_tininess_before_rounding )
 526                 || ( zExp < -1 )
 527                 || ! increment
 528                 || lt128(
 529                        zSig0,
 530                        zSig1,
 531                        LIT64( 0x0001FFFFFFFFFFFF ),
 532                        LIT64( 0xFFFFFFFFFFFFFFFF )
 533                    );
 534             shift128ExtraRightJamming(
 535                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 536             zExp = 0;
 537             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
 538             if ( roundNearestEven ) {
 539                 increment = ( (sbits64) zSig2 < 0 );
 540             }
 541             else {
 542                 if ( zSign ) {
 543                     increment = ( roundingMode == float_round_down ) && zSig2;
 544                 }
 545                 else {
 546                     increment = ( roundingMode == float_round_up ) && zSig2;
 547                 }
 548             }
 549         }
 550     }
 551     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
 552     if ( increment ) {
 553         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 554         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 555     }
 556     else {
 557         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
 558     }
 559     return packFloat128( zSign, zExp, zSig0, zSig1 );
 560
 561 }
 562
 563 /*----------------------------------------------------------------------------
 564 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 565 | and significand formed by the concatenation of `zSig0' and `zSig1', and
 566 | returns the proper quadruple-precision floating-point value corresponding
 567 | to the abstract input.  This routine is just like `roundAndPackFloat128'
 568 | except that the input significand has fewer bits and does not have to be
 569 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
 570 | point exponent.
 571 *----------------------------------------------------------------------------*/
 572
 573 static float128
 574  normalizeRoundAndPackFloat128(
 575      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 576 {
 577     int8 shiftCount;
 578     bits64 zSig2;
 579
 580     if ( zSig0 == 0 ) {
 581         zSig0 = zSig1;
 582         zSig1 = 0;
 583         zExp -= 64;
 584     }
 585     shiftCount = countLeadingZeros64( zSig0 ) - 15;
 586     if ( 0 <= shiftCount ) {
 587         zSig2 = 0;
 588         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 589     }
 590     else {
 591         shift128ExtraRightJamming(
 592             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
 593     }
 594     zExp -= shiftCount;
 595     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
 596
 597 }
 598
 599 #endif
 600
 601 #ifdef FLOATX80
 602
 603 /*----------------------------------------------------------------------------
 604 | Returns the result of converting the 32-bit two's complement integer `a'
 605 | to the extended double-precision floating-point format.  The conversion
 606 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 607 | Arithmetic.
 608 *----------------------------------------------------------------------------*/
 609
 610 floatx80 int32_to_floatx80( int32 a )
 611 {
 612     flag zSign;
 613     uint32 absA;
 614     int8 shiftCount;
 615     bits64 zSig;
 616
 617     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 618     zSign = ( a < 0 );
 619     absA = zSign ? - a : a;
 620     shiftCount = countLeadingZeros32( absA ) + 32;
 621     zSig = absA;
 622     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
 623
 624 }
 625
 626 #endif
 627
 628 #ifdef FLOAT128
 629
 630 /*----------------------------------------------------------------------------
 631 | Returns the result of converting the 32-bit two's complement integer `a' to
 632 | the quadruple-precision floating-point format.  The conversion is performed
 633 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 634 *----------------------------------------------------------------------------*/
 635
 636 float128 int32_to_float128( int32 a )
 637 {
 638     flag zSign;
 639     uint32 absA;
 640     int8 shiftCount;
 641     bits64 zSig0;
 642
 643     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 644     zSign = ( a < 0 );
 645     absA = zSign ? - a : a;
 646     shiftCount = countLeadingZeros32( absA ) + 17;
 647     zSig0 = absA;
 648     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
 649
 650 }
 651
 652 #endif
 653
 654 #ifdef FLOATX80
 655
 656 /*----------------------------------------------------------------------------
 657 | Returns the result of converting the 64-bit two's complement integer `a'
 658 | to the extended double-precision floating-point format.  The conversion
 659 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 660 | Arithmetic.
 661 *----------------------------------------------------------------------------*/
 662
 663 floatx80 int64_to_floatx80( int64 a )
 664 {
 665     flag zSign;
 666     uint64 absA;
 667     int8 shiftCount;
 668
 669     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
 670     zSign = ( a < 0 );
 671     absA = zSign ? - a : a;
 672     shiftCount = countLeadingZeros64( absA );
 673     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
 674
 675 }
 676
 677 #endif
 678
 679 #ifdef FLOAT128
 680
 681 /*----------------------------------------------------------------------------
 682 | Returns the result of converting the 64-bit two's complement integer `a' to
 683 | the quadruple-precision floating-point format.  The conversion is performed
 684 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 685 *----------------------------------------------------------------------------*/
 686
 687 float128 int64_to_float128( int64 a )
 688 {
 689     flag zSign;
 690     uint64 absA;
 691     int8 shiftCount;
 692     int32 zExp;
 693     bits64 zSig0, zSig1;
 694
 695     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
 696     zSign = ( a < 0 );
 697     absA = zSign ? - a : a;
 698     shiftCount = countLeadingZeros64( absA ) + 49;
 699     zExp = 0x406E - shiftCount;
 700     if ( 64 <= shiftCount ) {
 701         zSig1 = 0;
 702         zSig0 = absA;
 703         shiftCount -= 64;
 704     }
 705     else {
 706         zSig1 = absA;
 707         zSig0 = 0;
 708     }
 709     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 710     return packFloat128( zSign, zExp, zSig0, zSig1 );
 711
 712 }
 713
 714 #endif
 715
 716 #ifdef FLOATX80
 717
 718 /*----------------------------------------------------------------------------
 719 | Returns the result of converting the single-precision floating-point value
 720 | `a' to the extended double-precision floating-point format.  The conversion
 721 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 722 | Arithmetic.
 723 *----------------------------------------------------------------------------*/
 724
 725 floatx80 float32_to_floatx80( float32 a )
 726 {
 727     flag aSign;
 728     int16 aExp;
 729     bits32 aSig;
 730
 731     aSig = extractFloat32Frac( a );
 732     aExp = extractFloat32Exp( a );
 733     aSign = extractFloat32Sign( a );
 734     if ( aExp == 0xFF ) {
 735         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
 736         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 737     }
 738     if ( aExp == 0 ) {
 739         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 740         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 741     }
 742     aSig |= 0x00800000;
 743     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
 744
 745 }
 746
 747 #endif
 748
 749 #ifdef FLOAT128
 750
 751 /*----------------------------------------------------------------------------
 752 | Returns the result of converting the single-precision floating-point value
 753 | `a' to the double-precision floating-point format.  The conversion is
 754 | performed according to the IEC/IEEE Standard for Binary Floating-Point
 755 | Arithmetic.
 756 *----------------------------------------------------------------------------*/
 757
 758 float128 float32_to_float128( float32 a )
 759 {
 760     flag aSign;
 761     int16 aExp;
 762     bits32 aSig;
 763
 764     aSig = extractFloat32Frac( a );
 765     aExp = extractFloat32Exp( a );
 766     aSign = extractFloat32Sign( a );
 767     if ( aExp == 0xFF ) {
 768         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
 769         return packFloat128( aSign, 0x7FFF, 0, 0 );
 770     }
 771     if ( aExp == 0 ) {
 772         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
 773         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
 774         --aExp;
 775     }
 776     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
 777
 778 }
 779
 780 #endif
 781
 782 #ifdef FLOATX80
 783
 784 /*----------------------------------------------------------------------------
 785 | Returns the result of converting the double-precision floating-point value
 786 | `a' to the extended double-precision floating-point format.  The conversion
 787 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
 788 | Arithmetic.
 789 *----------------------------------------------------------------------------*/
 790
 791 floatx80 float64_to_floatx80( float64 a )
 792 {
 793     flag aSign;
 794     int16 aExp;
 795     bits64 aSig;
 796
 797     aSig = extractFloat64Frac( a );
 798     aExp = extractFloat64Exp( a );
 799     aSign = extractFloat64Sign( a );
 800     if ( aExp == 0x7FF ) {
 801         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
 802         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 803     }
 804     if ( aExp == 0 ) {
 805         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
 806         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 807     }
 808     return
 809         packFloatx80(
 810             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
 811
 812 }
 813
 814 #endif
 815
 816 #ifdef FLOAT128
 817
 818 /*----------------------------------------------------------------------------
 819 | Returns the result of converting the double-precision floating-point value
 820 | `a' to the quadruple-precision floating-point format.  The conversion is
 821 | performed according to the IEC/IEEE Standard for Binary Floating-Point
 822 | Arithmetic.
 823 *----------------------------------------------------------------------------*/
 824
 825 float128 float64_to_float128( float64 a )
 826 {
 827     flag aSign;
 828     int16 aExp;
 829     bits64 aSig, zSig0, zSig1;
 830
 831     aSig = extractFloat64Frac( a );
 832     aExp = extractFloat64Exp( a );
 833     aSign = extractFloat64Sign( a );
 834     if ( aExp == 0x7FF ) {
 835         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
 836         return packFloat128( aSign, 0x7FFF, 0, 0 );
 837     }
 838     if ( aExp == 0 ) {
 839         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
 840         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
 841         --aExp;
 842     }
 843     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
 844     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
 845
 846 }
 847
 848 #endif
 849
 850 #ifdef FLOATX80
 851
 852 /*----------------------------------------------------------------------------
 853 | Returns the result of converting the extended double-precision floating-
 854 | point value `a' to the 32-bit two's complement integer format.  The
 855 | conversion is performed according to the IEC/IEEE Standard for Binary
 856 | Floating-Point Arithmetic---which means in particular that the conversion
 857 | is rounded according to the current rounding mode.  If `a' is a NaN, the
 858 | largest positive integer is returned.  Otherwise, if the conversion
 859 | overflows, the largest integer with the same sign as `a' is returned.
 860 *----------------------------------------------------------------------------*/
 861
 862 int32 floatx80_to_int32( floatx80 a )
 863 {
 864     flag aSign;
 865     int32 aExp, shiftCount;
 866     bits64 aSig;
 867
 868     aSig = extractFloatx80Frac( a );
 869     aExp = extractFloatx80Exp( a );
 870     aSign = extractFloatx80Sign( a );
 871     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
 872     shiftCount = 0x4037 - aExp;
 873     if ( shiftCount <= 0 ) shiftCount = 1;
 874     shift64RightJamming( aSig, shiftCount, &aSig );
 875     return roundAndPackInt32( aSign, aSig );
 876
 877 }
 878
 879 /*----------------------------------------------------------------------------
 880 | Returns the result of converting the extended double-precision floating-
 881 | point value `a' to the 32-bit two's complement integer format.  The
 882 | conversion is performed according to the IEC/IEEE Standard for Binary
 883 | Floating-Point Arithmetic, except that the conversion is always rounded
 884 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
 885 | Otherwise, if the conversion overflows, the largest integer with the same
 886 | sign as `a' is returned.
 887 *----------------------------------------------------------------------------*/
 888
 889 int32 floatx80_to_int32_round_to_zero( floatx80 a )
 890 {
 891     flag aSign;
 892     int32 aExp, shiftCount;
 893     bits64 aSig, savedASig;
 894     int32 z;
 895
 896     aSig = extractFloatx80Frac( a );
 897     aExp = extractFloatx80Exp( a );
 898     aSign = extractFloatx80Sign( a );
 899     if ( 0x401E < aExp ) {
 900         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
 901         goto invalid;
 902     }
 903     else if ( aExp < 0x3FFF ) {
 904         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
 905         return 0;
 906     }
 907     shiftCount = 0x403E - aExp;
 908     savedASig = aSig;
 909     aSig >>= shiftCount;
 910     z = aSig;
 911     if ( aSign ) z = - z;
 912     if ( ( z < 0 ) ^ aSign ) {
 913  invalid:
 914         float_raise( float_flag_invalid );
 915         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 916     }
 917     if ( ( aSig<<shiftCount ) != savedASig ) {
 918         float_exception_flags |= float_flag_inexact;
 919     }
 920     return z;
 921
 922 }
 923
 924 /*----------------------------------------------------------------------------
 925 | Returns the result of converting the extended double-precision floating-
 926 | point value `a' to the 64-bit two's complement integer format.  The
 927 | conversion is performed according to the IEC/IEEE Standard for Binary
 928 | Floating-Point Arithmetic---which means in particular that the conversion
 929 | is rounded according to the current rounding mode.  If `a' is a NaN,
 930 | the largest positive integer is returned.  Otherwise, if the conversion
 931 | overflows, the largest integer with the same sign as `a' is returned.
 932 *----------------------------------------------------------------------------*/
 933
 934 int64 floatx80_to_int64( floatx80 a )
 935 {
 936     flag aSign;
 937     int32 aExp, shiftCount;
 938     bits64 aSig, aSigExtra;
 939
 940     aSig = extractFloatx80Frac( a );
 941     aExp = extractFloatx80Exp( a );
 942     aSign = extractFloatx80Sign( a );
 943     shiftCount = 0x403E - aExp;
 944     if ( shiftCount <= 0 ) {
 945         if ( shiftCount ) {
 946             float_raise( float_flag_invalid );
 947             if (    ! aSign
 948                  || (    ( aExp == 0x7FFF )
 949                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
 950                ) {
 951                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 952             }
 953             return (sbits64) LIT64( 0x8000000000000000 );
 954         }
 955         aSigExtra = 0;
 956     }
 957     else {
 958         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
 959     }
 960     return roundAndPackInt64( aSign, aSig, aSigExtra );
 961
 962 }
 963
 964 /*----------------------------------------------------------------------------
 965 | Returns the result of converting the extended double-precision floating-
 966 | point value `a' to the 64-bit two's complement integer format.  The
 967 | conversion is performed according to the IEC/IEEE Standard for Binary
 968 | Floating-Point Arithmetic, except that the conversion is always rounded
 969 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
 970 | Otherwise, if the conversion overflows, the largest integer with the same
 971 | sign as `a' is returned.
 972 *----------------------------------------------------------------------------*/
 973
 974 int64 floatx80_to_int64_round_to_zero( floatx80 a )
 975 {
 976     flag aSign;
 977     int32 aExp, shiftCount;
 978     bits64 aSig;
 979     int64 z;
 980
 981     aSig = extractFloatx80Frac( a );
 982     aExp = extractFloatx80Exp( a );
 983     aSign = extractFloatx80Sign( a );
 984     shiftCount = aExp - 0x403E;
 985     if ( 0 <= shiftCount ) {
 986         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
 987         if ( ( a.high != 0xC03E ) || aSig ) {
 988             float_raise( float_flag_invalid );
 989             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
 990                 return LIT64( 0x7FFFFFFFFFFFFFFF );
 991             }
 992         }
 993         return (sbits64) LIT64( 0x8000000000000000 );
 994     }
 995     else if ( aExp < 0x3FFF ) {
 996         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
 997         return 0;
 998     }
 999     z = aSig>>( - shiftCount );
1000     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
1001         float_exception_flags |= float_flag_inexact;
1002     }
1003     if ( aSign ) z = - z;
1004     return z;
1005
1006 }
1007
1008 /*----------------------------------------------------------------------------
1009 | Returns the result of converting the extended double-precision floating-
1010 | point value `a' to the single-precision floating-point format.  The
1011 | conversion is performed according to the IEC/IEEE Standard for Binary
1012 | Floating-Point Arithmetic.
1013 *----------------------------------------------------------------------------*/
1014
1015 float32 floatx80_to_float32( floatx80 a )
1016 {
1017     flag aSign;
1018     int32 aExp;
1019     bits64 aSig;
1020
1021     aSig = extractFloatx80Frac( a );
1022     aExp = extractFloatx80Exp( a );
1023     aSign = extractFloatx80Sign( a );
1024     if ( aExp == 0x7FFF ) {
1025         if ( (bits64) ( aSig<<1 ) ) {
1026             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
1027         }
1028         return packFloat32( aSign, 0xFF, 0 );
1029     }
1030     shift64RightJamming( aSig, 33, &aSig );
1031     if ( aExp || aSig ) aExp -= 0x3F81;
1032     return roundAndPackFloat32( aSign, aExp, aSig );
1033
1034 }
1035
1036 /*----------------------------------------------------------------------------
1037 | Returns the result of converting the extended double-precision floating-
1038 | point value `a' to the double-precision floating-point format.  The
1039 | conversion is performed according to the IEC/IEEE Standard for Binary
1040 | Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1042
1043 float64 floatx80_to_float64( floatx80 a )
1044 {
1045     flag aSign;
1046     int32 aExp;
1047     bits64 aSig, zSig;
1048
1049     aSig = extractFloatx80Frac( a );
1050     aExp = extractFloatx80Exp( a );
1051     aSign = extractFloatx80Sign( a );
1052     if ( aExp == 0x7FFF ) {
1053         if ( (bits64) ( aSig<<1 ) ) {
1054             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
1055         }
1056         return packFloat64( aSign, 0x7FF, 0 );
1057     }
1058     shift64RightJamming( aSig, 1, &zSig );
1059     if ( aExp || aSig ) aExp -= 0x3C01;
1060     return roundAndPackFloat64( aSign, aExp, zSig );
1061
1062 }
1063
1064 #ifdef FLOAT128
1065
1066 /*----------------------------------------------------------------------------
1067 | Returns the result of converting the extended double-precision floating-
1068 | point value `a' to the quadruple-precision floating-point format.  The
1069 | conversion is performed according to the IEC/IEEE Standard for Binary
1070 | Floating-Point Arithmetic.
1071 *----------------------------------------------------------------------------*/
1072
1073 float128 floatx80_to_float128( floatx80 a )
1074 {
1075     flag aSign;
1076     int16 aExp;
1077     bits64 aSig, zSig0, zSig1;
1078
1079     aSig = extractFloatx80Frac( a );
1080     aExp = extractFloatx80Exp( a );
1081     aSign = extractFloatx80Sign( a );
1082     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
1083         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
1084     }
1085     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
1086     return packFloat128( aSign, aExp, zSig0, zSig1 );
1087
1088 }
1089
1090 #endif
1091
1092 /*----------------------------------------------------------------------------
1093 | Rounds the extended double-precision floating-point value `a' to an integer,
1094 | and returns the result as an extended quadruple-precision floating-point
1095 | value.  The operation is performed according to the IEC/IEEE Standard for
1096 | Binary Floating-Point Arithmetic.
1097 *----------------------------------------------------------------------------*/
1098
1099 floatx80 floatx80_round_to_int( floatx80 a )
1100 {
1101     flag aSign;
1102     int32 aExp;
1103     bits64 lastBitMask, roundBitsMask;
1104     int8 roundingMode;
1105     floatx80 z;
1106
1107     aExp = extractFloatx80Exp( a );
1108     if ( 0x403E <= aExp ) {
1109         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
1110             return propagateFloatx80NaN( a, a );
1111         }
1112         return a;
1113     }
1114     if ( aExp < 0x3FFF ) {
1115         if (    ( aExp == 0 )
1116              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
1117             return a;
1118         }
1119         float_exception_flags |= float_flag_inexact;
1120         aSign = extractFloatx80Sign( a );
1121         switch ( float_rounding_mode ) {
1122          case float_round_nearest_even:
1123             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
1124                ) {
1125                 return
1126                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
1127             }
1128             break;
1129          case float_round_down:
1130             return
1131                   aSign ?
1132                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
1133                 : packFloatx80( 0, 0, 0 );
1134          case float_round_up:
1135             return
1136                   aSign ? packFloatx80( 1, 0, 0 )
1137                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
1138         }
1139         return packFloatx80( aSign, 0, 0 );
1140     }
1141     lastBitMask = 1;
1142     lastBitMask <<= 0x403E - aExp;
1143     roundBitsMask = lastBitMask - 1;
1144     z = a;
1145     roundingMode = float_rounding_mode;
1146     if ( roundingMode == float_round_nearest_even ) {
1147         z.low += lastBitMask>>1;
1148         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
1149     }
1150     else if ( roundingMode != float_round_to_zero ) {
1151         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1152             z.low += roundBitsMask;
1153         }
1154     }
1155     z.low &= ~ roundBitsMask;
1156     if ( z.low == 0 ) {
1157         ++z.high;
1158         z.low = LIT64( 0x8000000000000000 );
1159     }
1160     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
1161     return z;
1162
1163 }
1164
1165 /*----------------------------------------------------------------------------
1166 | Returns the result of adding the absolute values of the extended double-
1167 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
1168 | negated before being returned.  `zSign' is ignored if the result is a NaN.
1169 | The addition is performed according to the IEC/IEEE Standard for Binary
1170 | Floating-Point Arithmetic.
1171 *----------------------------------------------------------------------------*/
1172
1173 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
1174 {
1175     int32 aExp, bExp, zExp;
1176     bits64 aSig, bSig, zSig0, zSig1;
1177     int32 expDiff;
1178
1179     aSig = extractFloatx80Frac( a );
1180     aExp = extractFloatx80Exp( a );
1181     bSig = extractFloatx80Frac( b );
1182     bExp = extractFloatx80Exp( b );
1183     expDiff = aExp - bExp;
1184     if ( 0 < expDiff ) {
1185         if ( aExp == 0x7FFF ) {
1186             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
1187             return a;
1188         }
1189         if ( bExp == 0 ) --expDiff;
1190         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
1191         zExp = aExp;
1192     }
1193     else if ( expDiff < 0 ) {
1194         if ( bExp == 0x7FFF ) {
1195             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1196             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1197         }
1198         if ( aExp == 0 ) ++expDiff;
1199         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
1200         zExp = bExp;
1201     }
1202     else {
1203         if ( aExp == 0x7FFF ) {
1204             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
1205                 return propagateFloatx80NaN( a, b );
1206             }
1207             return a;
1208         }
1209         zSig1 = 0;
1210         zSig0 = aSig + bSig;
1211         if ( aExp == 0 ) {
1212             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
1213             goto roundAndPack;
1214         }
1215         zExp = aExp;
1216         goto shiftRight1;
1217     }
1218     zSig0 = aSig + bSig;
1219     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
1220  shiftRight1:
1221     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
1222     zSig0 |= LIT64( 0x8000000000000000 );
1223     ++zExp;
1224  roundAndPack:
1225     return
1226         roundAndPackFloatx80(
1227             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1228
1229 }
1230
1231 /*----------------------------------------------------------------------------
1232 | Returns the result of subtracting the absolute values of the extended
1233 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
1234 | difference is negated before being returned.  `zSign' is ignored if the
1235 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1236 | Standard for Binary Floating-Point Arithmetic.
1237 *----------------------------------------------------------------------------*/
1238
1239 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
1240 {
1241     int32 aExp, bExp, zExp;
1242     bits64 aSig, bSig, zSig0, zSig1;
1243     int32 expDiff;
1244     floatx80 z;
1245
1246     aSig = extractFloatx80Frac( a );
1247     aExp = extractFloatx80Exp( a );
1248     bSig = extractFloatx80Frac( b );
1249     bExp = extractFloatx80Exp( b );
1250     expDiff = aExp - bExp;
1251     if ( 0 < expDiff ) goto aExpBigger;
1252     if ( expDiff < 0 ) goto bExpBigger;
1253     if ( aExp == 0x7FFF ) {
1254         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
1255             return propagateFloatx80NaN( a, b );
1256         }
1257         float_raise( float_flag_invalid );
1258         z.low = floatx80_default_nan_low;
1259         z.high = floatx80_default_nan_high;
1260         return z;
1261     }
1262     if ( aExp == 0 ) {
1263         aExp = 1;
1264         bExp = 1;
1265     }
1266     zSig1 = 0;
1267     if ( bSig < aSig ) goto aBigger;
1268     if ( aSig < bSig ) goto bBigger;
1269     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
1270  bExpBigger:
1271     if ( bExp == 0x7FFF ) {
1272         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1273         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
1274     }
1275     if ( aExp == 0 ) ++expDiff;
1276     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
1277  bBigger:
1278     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
1279     zExp = bExp;
1280     zSign ^= 1;
1281     goto normalizeRoundAndPack;
1282  aExpBigger:
1283     if ( aExp == 0x7FFF ) {
1284         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
1285         return a;
1286     }
1287     if ( bExp == 0 ) --expDiff;
1288     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
1289  aBigger:
1290     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
1291     zExp = aExp;
1292  normalizeRoundAndPack:
1293     return
1294         normalizeRoundAndPackFloatx80(
1295             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1296
1297 }
1298
1299 /*----------------------------------------------------------------------------
1300 | Returns the result of adding the extended double-precision floating-point
1301 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
1302 | Standard for Binary Floating-Point Arithmetic.
1303 *----------------------------------------------------------------------------*/
1304
1305 floatx80 floatx80_add( floatx80 a, floatx80 b )
1306 {
1307     flag aSign, bSign;
1308
1309     aSign = extractFloatx80Sign( a );
1310     bSign = extractFloatx80Sign( b );
1311     if ( aSign == bSign ) {
1312         return addFloatx80Sigs( a, b, aSign );
1313     }
1314     else {
1315         return subFloatx80Sigs( a, b, aSign );
1316     }
1317
1318 }
1319
1320 /*----------------------------------------------------------------------------
1321 | Returns the result of subtracting the extended double-precision floating-
1322 | point values `a' and `b'.  The operation is performed according to the
1323 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1324 *----------------------------------------------------------------------------*/
1325
1326 floatx80 floatx80_sub( floatx80 a, floatx80 b )
1327 {
1328     flag aSign, bSign;
1329
1330     aSign = extractFloatx80Sign( a );
1331     bSign = extractFloatx80Sign( b );
1332     if ( aSign == bSign ) {
1333         return subFloatx80Sigs( a, b, aSign );
1334     }
1335     else {
1336         return addFloatx80Sigs( a, b, aSign );
1337     }
1338
1339 }
1340
1341 /*----------------------------------------------------------------------------
1342 | Returns the result of multiplying the extended double-precision floating-
1343 | point values `a' and `b'.  The operation is performed according to the
1344 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1345 *----------------------------------------------------------------------------*/
1346
1347 floatx80 floatx80_mul( floatx80 a, floatx80 b )
1348 {
1349     flag aSign, bSign, zSign;
1350     int32 aExp, bExp, zExp;
1351     bits64 aSig, bSig, zSig0, zSig1;
1352     floatx80 z;
1353
1354     aSig = extractFloatx80Frac( a );
1355     aExp = extractFloatx80Exp( a );
1356     aSign = extractFloatx80Sign( a );
1357     bSig = extractFloatx80Frac( b );
1358     bExp = extractFloatx80Exp( b );
1359     bSign = extractFloatx80Sign( b );
1360     zSign = aSign ^ bSign;
1361     if ( aExp == 0x7FFF ) {
1362         if (    (bits64) ( aSig<<1 )
1363              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
1364             return propagateFloatx80NaN( a, b );
1365         }
1366         if ( ( bExp | bSig ) == 0 ) goto invalid;
1367         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1368     }
1369     if ( bExp == 0x7FFF ) {
1370         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1371         if ( ( aExp | aSig ) == 0 ) {
1372  invalid:
1373             float_raise( float_flag_invalid );
1374             z.low = floatx80_default_nan_low;
1375             z.high = floatx80_default_nan_high;
1376             return z;
1377         }
1378         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1379     }
1380     if ( aExp == 0 ) {
1381         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
1382         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
1383     }
1384     if ( bExp == 0 ) {
1385         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
1386         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
1387     }
1388     zExp = aExp + bExp - 0x3FFE;
1389     mul64To128( aSig, bSig, &zSig0, &zSig1 );
1390     if ( 0 < (sbits64) zSig0 ) {
1391         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
1392         --zExp;
1393     }
1394     return
1395         roundAndPackFloatx80(
1396             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1397
1398 }
1399
1400 /*----------------------------------------------------------------------------
1401 | Returns the result of dividing the extended double-precision floating-point
1402 | value `a' by the corresponding value `b'.  The operation is performed
1403 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1404 *----------------------------------------------------------------------------*/
1405
1406 floatx80 floatx80_div( floatx80 a, floatx80 b )
1407 {
1408     flag aSign, bSign, zSign;
1409     int32 aExp, bExp, zExp;
1410     bits64 aSig, bSig, zSig0, zSig1;
1411     bits64 rem0, rem1, rem2, term0, term1, term2;
1412     floatx80 z;
1413
1414     aSig = extractFloatx80Frac( a );
1415     aExp = extractFloatx80Exp( a );
1416     aSign = extractFloatx80Sign( a );
1417     bSig = extractFloatx80Frac( b );
1418     bExp = extractFloatx80Exp( b );
1419     bSign = extractFloatx80Sign( b );
1420     zSign = aSign ^ bSign;
1421     if ( aExp == 0x7FFF ) {
1422         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
1423         if ( bExp == 0x7FFF ) {
1424             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1425             goto invalid;
1426         }
1427         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1428     }
1429     if ( bExp == 0x7FFF ) {
1430         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1431         return packFloatx80( zSign, 0, 0 );
1432     }
1433     if ( bExp == 0 ) {
1434         if ( bSig == 0 ) {
1435             if ( ( aExp | aSig ) == 0 ) {
1436  invalid:
1437                 float_raise( float_flag_invalid );
1438                 z.low = floatx80_default_nan_low;
1439                 z.high = floatx80_default_nan_high;
1440                 return z;
1441             }
1442             float_raise( float_flag_divbyzero );
1443             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1444         }
1445         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
1446     }
1447     if ( aExp == 0 ) {
1448         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
1449         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
1450     }
1451     zExp = aExp - bExp + 0x3FFE;
1452     rem1 = 0;
1453     if ( bSig <= aSig ) {
1454         shift128Right( aSig, 0, 1, &aSig, &rem1 );
1455         ++zExp;
1456     }
1457     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
1458     mul64To128( bSig, zSig0, &term0, &term1 );
1459     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
1460     while ( (sbits64) rem0 < 0 ) {
1461         --zSig0;
1462         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
1463     }
1464     zSig1 = estimateDiv128To64( rem1, 0, bSig );
1465     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
1466         mul64To128( bSig, zSig1, &term1, &term2 );
1467         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
1468         while ( (sbits64) rem1 < 0 ) {
1469             --zSig1;
1470             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
1471         }
1472         zSig1 |= ( ( rem1 | rem2 ) != 0 );
1473     }
1474     return
1475         roundAndPackFloatx80(
1476             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
1477
1478 }
1479
1480 /*----------------------------------------------------------------------------
1481 | Returns the remainder of the extended double-precision floating-point value
1482 | `a' with respect to the corresponding value `b'.  The operation is performed
1483 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1484 *----------------------------------------------------------------------------*/
1485
1486 floatx80 floatx80_rem( floatx80 a, floatx80 b )
1487 {
1488     flag aSign, bSign, zSign;
1489     int32 aExp, bExp, expDiff;
1490     bits64 aSig0, aSig1, bSig;
1491     bits64 q, term0, term1, alternateASig0, alternateASig1;
1492     floatx80 z;
1493
1494     aSig0 = extractFloatx80Frac( a );
1495     aExp = extractFloatx80Exp( a );
1496     aSign = extractFloatx80Sign( a );
1497     bSig = extractFloatx80Frac( b );
1498     bExp = extractFloatx80Exp( b );
1499     bSign = extractFloatx80Sign( b );
1500     if ( aExp == 0x7FFF ) {
1501         if (    (bits64) ( aSig0<<1 )
1502              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
1503             return propagateFloatx80NaN( a, b );
1504         }
1505         goto invalid;
1506     }
1507     if ( bExp == 0x7FFF ) {
1508         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
1509         return a;
1510     }
1511     if ( bExp == 0 ) {
1512         if ( bSig == 0 ) {
1513  invalid:
1514             float_raise( float_flag_invalid );
1515             z.low = floatx80_default_nan_low;
1516             z.high = floatx80_default_nan_high;
1517             return z;
1518         }
1519         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
1520     }
1521     if ( aExp == 0 ) {
1522         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
1523         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
1524     }
1525     bSig |= LIT64( 0x8000000000000000 );
1526     zSign = aSign;
1527     expDiff = aExp - bExp;
1528     aSig1 = 0;
1529     if ( expDiff < 0 ) {
1530         if ( expDiff < -1 ) return a;
1531         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
1532         expDiff = 0;
1533     }
1534     q = ( bSig <= aSig0 );
1535     if ( q ) aSig0 -= bSig;
1536     expDiff -= 64;
1537     while ( 0 < expDiff ) {
1538         q = estimateDiv128To64( aSig0, aSig1, bSig );
1539         q = ( 2 < q ) ? q - 2 : 0;
1540         mul64To128( bSig, q, &term0, &term1 );
1541         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
1542         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
1543         expDiff -= 62;
1544     }
1545     expDiff += 64;
1546     if ( 0 < expDiff ) {
1547         q = estimateDiv128To64( aSig0, aSig1, bSig );
1548         q = ( 2 < q ) ? q - 2 : 0;
1549         q >>= 64 - expDiff;
1550         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
1551         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
1552         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
1553         while ( le128( term0, term1, aSig0, aSig1 ) ) {
1554             ++q;
1555             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
1556         }
1557     }
1558     else {
1559         term1 = 0;
1560         term0 = bSig;
1561     }
1562     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
1563     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
1564          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
1565               && ( q & 1 ) )
1566        ) {
1567         aSig0 = alternateASig0;
1568         aSig1 = alternateASig1;
1569         zSign = ! zSign;
1570     }
1571     return
1572         normalizeRoundAndPackFloatx80(
1573             80, zSign, bExp + expDiff, aSig0, aSig1 );
1574
1575 }
1576
1577 /*----------------------------------------------------------------------------
1578 | Returns the square root of the extended double-precision floating-point
1579 | value `a'.  The operation is performed according to the IEC/IEEE Standard
1580 | for Binary Floating-Point Arithmetic.
1581 *----------------------------------------------------------------------------*/
1582
1583 floatx80 floatx80_sqrt( floatx80 a )
1584 {
1585     flag aSign;
1586     int32 aExp, zExp;
1587     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
1588     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
1589     floatx80 z;
1590
1591     aSig0 = extractFloatx80Frac( a );
1592     aExp = extractFloatx80Exp( a );
1593     aSign = extractFloatx80Sign( a );
1594     if ( aExp == 0x7FFF ) {
1595         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
1596         if ( ! aSign ) return a;
1597         goto invalid;
1598     }
1599     if ( aSign ) {
1600         if ( ( aExp | aSig0 ) == 0 ) return a;
1601  invalid:
1602         float_raise( float_flag_invalid );
1603         z.low = floatx80_default_nan_low;
1604         z.high = floatx80_default_nan_high;
1605         return z;
1606     }
1607     if ( aExp == 0 ) {
1608         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
1609         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
1610     }
1611     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
1612     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
1613     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
1614     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
1615     doubleZSig0 = zSig0<<1;
1616     mul64To128( zSig0, zSig0, &term0, &term1 );
1617     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
1618     while ( (sbits64) rem0 < 0 ) {
1619         --zSig0;
1620         doubleZSig0 -= 2;
1621         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
1622     }
1623     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
1624     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
1625         if ( zSig1 == 0 ) zSig1 = 1;
1626         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
1627         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
1628         mul64To128( zSig1, zSig1, &term2, &term3 );
1629         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
1630         while ( (sbits64) rem1 < 0 ) {
1631             --zSig1;
1632             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
1633             term3 |= 1;
1634             term2 |= doubleZSig0;
1635             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
1636         }
1637         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
1638     }
1639     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
1640     zSig0 |= doubleZSig0;
1641     return
1642         roundAndPackFloatx80(
1643             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
1644
1645 }
1646
1647 /*----------------------------------------------------------------------------
1648 | Returns 1 if the extended double-precision floating-point value `a' is
1649 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
1650 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1651 | Arithmetic.
1652 *----------------------------------------------------------------------------*/
1653
1654 flag floatx80_eq( floatx80 a, floatx80 b )
1655 {
1656
1657     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
1658               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1659          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
1660               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1661        ) {
1662         if (    floatx80_is_signaling_nan( a )
1663              || floatx80_is_signaling_nan( b ) ) {
1664             float_raise( float_flag_invalid );
1665         }
1666         return 0;
1667     }
1668     return
1669            ( a.low == b.low )
1670         && (    ( a.high == b.high )
1671              || (    ( a.low == 0 )
1672                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
1673            );
1674
1675 }
1676
1677 /*----------------------------------------------------------------------------
1678 | Returns 1 if the extended double-precision floating-point value `a' is
1679 | less than or equal to the corresponding value `b', and 0 otherwise.  The
1680 | comparison is performed according to the IEC/IEEE Standard for Binary
1681 | Floating-Point Arithmetic.
1682 *----------------------------------------------------------------------------*/
1683
1684 flag floatx80_le( floatx80 a, floatx80 b )
1685 {
1686     flag aSign, bSign;
1687
1688     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
1689               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1690          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
1691               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1692        ) {
1693         float_raise( float_flag_invalid );
1694         return 0;
1695     }
1696     aSign = extractFloatx80Sign( a );
1697     bSign = extractFloatx80Sign( b );
1698     if ( aSign != bSign ) {
1699         return
1700                aSign
1701             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1702                  == 0 );
1703     }
1704     return
1705           aSign ? le128( b.high, b.low, a.high, a.low )
1706         : le128( a.high, a.low, b.high, b.low );
1707
1708 }
1709
1710 /*----------------------------------------------------------------------------
1711 | Returns 1 if the extended double-precision floating-point value `a' is
1712 | less than the corresponding value `b', and 0 otherwise.  The comparison
1713 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1714 | Arithmetic.
1715 *----------------------------------------------------------------------------*/
1716
1717 flag floatx80_lt( floatx80 a, floatx80 b )
1718 {
1719     flag aSign, bSign;
1720
1721     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
1722               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1723          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
1724               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1725        ) {
1726         float_raise( float_flag_invalid );
1727         return 0;
1728     }
1729     aSign = extractFloatx80Sign( a );
1730     bSign = extractFloatx80Sign( b );
1731     if ( aSign != bSign ) {
1732         return
1733                aSign
1734             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1735                  != 0 );
1736     }
1737     return
1738           aSign ? lt128( b.high, b.low, a.high, a.low )
1739         : lt128( a.high, a.low, b.high, b.low );
1740
1741 }
1742
1743 /*----------------------------------------------------------------------------
1744 | Returns 1 if the extended double-precision floating-point value `a' is equal
1745 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
1746 | raised if either operand is a NaN.  Otherwise, the comparison is performed
1747 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1748 *----------------------------------------------------------------------------*/
1749
1750 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
1751 {
1752
1753     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
1754               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1755          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
1756               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1757        ) {
1758         float_raise( float_flag_invalid );
1759         return 0;
1760     }
1761     return
1762            ( a.low == b.low )
1763         && (    ( a.high == b.high )
1764              || (    ( a.low == 0 )
1765                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
1766            );
1767
1768 }
1769
1770 /*----------------------------------------------------------------------------
1771 | Returns 1 if the extended double-precision floating-point value `a' is less
1772 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
1773 | do not cause an exception.  Otherwise, the comparison is performed according
1774 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1775 *----------------------------------------------------------------------------*/
1776
1777 flag floatx80_le_quiet( floatx80 a, floatx80 b )
1778 {
1779     flag aSign, bSign;
1780
1781     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
1782               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1783          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
1784               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1785        ) {
1786         if (    floatx80_is_signaling_nan( a )
1787              || floatx80_is_signaling_nan( b ) ) {
1788             float_raise( float_flag_invalid );
1789         }
1790         return 0;
1791     }
1792     aSign = extractFloatx80Sign( a );
1793     bSign = extractFloatx80Sign( b );
1794     if ( aSign != bSign ) {
1795         return
1796                aSign
1797             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1798                  == 0 );
1799     }
1800     return
1801           aSign ? le128( b.high, b.low, a.high, a.low )
1802         : le128( a.high, a.low, b.high, b.low );
1803
1804 }
1805
1806 /*----------------------------------------------------------------------------
1807 | Returns 1 if the extended double-precision floating-point value `a' is less
1808 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
1809 | an exception.  Otherwise, the comparison is performed according to the
1810 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1811 *----------------------------------------------------------------------------*/
1812
1813 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
1814 {
1815     flag aSign, bSign;
1816
1817     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
1818               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
1819          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
1820               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
1821        ) {
1822         if (    floatx80_is_signaling_nan( a )
1823              || floatx80_is_signaling_nan( b ) ) {
1824             float_raise( float_flag_invalid );
1825         }
1826         return 0;
1827     }
1828     aSign = extractFloatx80Sign( a );
1829     bSign = extractFloatx80Sign( b );
1830     if ( aSign != bSign ) {
1831         return
1832                aSign
1833             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
1834                  != 0 );
1835     }
1836     return
1837           aSign ? lt128( b.high, b.low, a.high, a.low )
1838         : lt128( a.high, a.low, b.high, b.low );
1839
1840 }
1841
1842 #endif
1843
1844 #ifdef FLOAT128
1845
1846 /*----------------------------------------------------------------------------
1847 | Returns the result of converting the quadruple-precision floating-point
1848 | value `a' to the 32-bit two's complement integer format.  The conversion
1849 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1850 | Arithmetic---which means in particular that the conversion is rounded
1851 | according to the current rounding mode.  If `a' is a NaN, the largest
1852 | positive integer is returned.  Otherwise, if the conversion overflows, the
1853 | largest integer with the same sign as `a' is returned.
1854 *----------------------------------------------------------------------------*/
1855
1856 int32 float128_to_int32( float128 a )
1857 {
1858     flag aSign;
1859     int32 aExp, shiftCount;
1860     bits64 aSig0, aSig1;
1861
1862     aSig1 = extractFloat128Frac1( a );
1863     aSig0 = extractFloat128Frac0( a );
1864     aExp = extractFloat128Exp( a );
1865     aSign = extractFloat128Sign( a );
1866     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
1867     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
1868     aSig0 |= ( aSig1 != 0 );
1869     shiftCount = 0x4028 - aExp;
1870     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
1871     return roundAndPackInt32( aSign, aSig0 );
1872
1873 }
1874
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of converting the quadruple-precision floating-point
1877 | value `a' to the 32-bit two's complement integer format.  The conversion
1878 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1879 | Arithmetic, except that the conversion is always rounded toward zero.  If
1880 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1881 | conversion overflows, the largest integer with the same sign as `a' is
1882 | returned.
1883 *----------------------------------------------------------------------------*/
1884
1885 int32 float128_to_int32_round_to_zero( float128 a )
1886 {
1887     flag aSign;
1888     int32 aExp, shiftCount;
1889     bits64 aSig0, aSig1, savedASig;
1890     int32 z;
1891
1892     aSig1 = extractFloat128Frac1( a );
1893     aSig0 = extractFloat128Frac0( a );
1894     aExp = extractFloat128Exp( a );
1895     aSign = extractFloat128Sign( a );
1896     aSig0 |= ( aSig1 != 0 );
1897     if ( 0x401E < aExp ) {
1898         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
1899         goto invalid;
1900     }
1901     else if ( aExp < 0x3FFF ) {
1902         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
1903         return 0;
1904     }
1905     aSig0 |= LIT64( 0x0001000000000000 );
1906     shiftCount = 0x402F - aExp;
1907     savedASig = aSig0;
1908     aSig0 >>= shiftCount;
1909     z = aSig0;
1910     if ( aSign ) z = - z;
1911     if ( ( z < 0 ) ^ aSign ) {
1912  invalid:
1913         float_raise( float_flag_invalid );
1914         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
1915     }
1916     if ( ( aSig0<<shiftCount ) != savedASig ) {
1917         float_exception_flags |= float_flag_inexact;
1918     }
1919     return z;
1920
1921 }
1922
1923 /*----------------------------------------------------------------------------
1924 | Returns the result of converting the quadruple-precision floating-point
1925 | value `a' to the 64-bit two's complement integer format.  The conversion
1926 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1927 | Arithmetic---which means in particular that the conversion is rounded
1928 | according to the current rounding mode.  If `a' is a NaN, the largest
1929 | positive integer is returned.  Otherwise, if the conversion overflows, the
1930 | largest integer with the same sign as `a' is returned.
1931 *----------------------------------------------------------------------------*/
1932
1933 int64 float128_to_int64( float128 a )
1934 {
1935     flag aSign;
1936     int32 aExp, shiftCount;
1937     bits64 aSig0, aSig1;
1938
1939     aSig1 = extractFloat128Frac1( a );
1940     aSig0 = extractFloat128Frac0( a );
1941     aExp = extractFloat128Exp( a );
1942     aSign = extractFloat128Sign( a );
1943     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
1944     shiftCount = 0x402F - aExp;
1945     if ( shiftCount <= 0 ) {
1946         if ( 0x403E < aExp ) {
1947             float_raise( float_flag_invalid );
1948             if (    ! aSign
1949                  || (    ( aExp == 0x7FFF )
1950                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
1951                     )
1952                ) {
1953                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1954             }
1955             return (sbits64) LIT64( 0x8000000000000000 );
1956         }
1957         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
1958     }
1959     else {
1960         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
1961     }
1962     return roundAndPackInt64( aSign, aSig0, aSig1 );
1963
1964 }
1965
1966 /*----------------------------------------------------------------------------
1967 | Returns the result of converting the quadruple-precision floating-point
1968 | value `a' to the 64-bit two's complement integer format.  The conversion
1969 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1970 | Arithmetic, except that the conversion is always rounded toward zero.
1971 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1972 | the conversion overflows, the largest integer with the same sign as `a' is
1973 | returned.
1974 *----------------------------------------------------------------------------*/
1975
1976 int64 float128_to_int64_round_to_zero( float128 a )
1977 {
1978     flag aSign;
1979     int32 aExp, shiftCount;
1980     bits64 aSig0, aSig1;
1981     int64 z;
1982
1983     aSig1 = extractFloat128Frac1( a );
1984     aSig0 = extractFloat128Frac0( a );
1985     aExp = extractFloat128Exp( a );
1986     aSign = extractFloat128Sign( a );
1987     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
1988     shiftCount = aExp - 0x402F;
1989     if ( 0 < shiftCount ) {
1990         if ( 0x403E <= aExp ) {
1991             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
1992             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
1993                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
1994                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
1995             }
1996             else {
1997                 float_raise( float_flag_invalid );
1998                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
1999                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2000                 }
2001             }
2002             return (sbits64) LIT64( 0x8000000000000000 );
2003         }
2004         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
2005         if ( (bits64) ( aSig1<<shiftCount ) ) {
2006             float_exception_flags |= float_flag_inexact;
2007         }
2008     }
2009     else {
2010         if ( aExp < 0x3FFF ) {
2011             if ( aExp | aSig0 | aSig1 ) {
2012                 float_exception_flags |= float_flag_inexact;
2013             }
2014             return 0;
2015         }
2016         z = aSig0>>( - shiftCount );
2017         if (    aSig1
2018              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
2019             float_exception_flags |= float_flag_inexact;
2020         }
2021     }
2022     if ( aSign ) z = - z;
2023     return z;
2024
2025 }
2026
2027 /*----------------------------------------------------------------------------
2028 | Returns the result of converting the quadruple-precision floating-point
2029 | value `a' to the single-precision floating-point format.  The conversion
2030 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2031 | Arithmetic.
2032 *----------------------------------------------------------------------------*/
2033
2034 float32 float128_to_float32( float128 a )
2035 {
2036     flag aSign;
2037     int32 aExp;
2038     bits64 aSig0, aSig1;
2039     bits32 zSig;
2040
2041     aSig1 = extractFloat128Frac1( a );
2042     aSig0 = extractFloat128Frac0( a );
2043     aExp = extractFloat128Exp( a );
2044     aSign = extractFloat128Sign( a );
2045     if ( aExp == 0x7FFF ) {
2046         if ( aSig0 | aSig1 ) {
2047             return commonNaNToFloat32( float128ToCommonNaN( a ) );
2048         }
2049         return packFloat32( aSign, 0xFF, 0 );
2050     }
2051     aSig0 |= ( aSig1 != 0 );
2052     shift64RightJamming( aSig0, 18, &aSig0 );
2053     zSig = aSig0;
2054     if ( aExp || zSig ) {
2055         zSig |= 0x40000000;
2056         aExp -= 0x3F81;
2057     }
2058     return roundAndPackFloat32( aSign, aExp, zSig );
2059
2060 }
2061
2062 /*----------------------------------------------------------------------------
2063 | Returns the result of converting the quadruple-precision floating-point
2064 | value `a' to the double-precision floating-point format.  The conversion
2065 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2066 | Arithmetic.
2067 *----------------------------------------------------------------------------*/
2068
2069 float64 float128_to_float64( float128 a )
2070 {
2071     flag aSign;
2072     int32 aExp;
2073     bits64 aSig0, aSig1;
2074
2075     aSig1 = extractFloat128Frac1( a );
2076     aSig0 = extractFloat128Frac0( a );
2077     aExp = extractFloat128Exp( a );
2078     aSign = extractFloat128Sign( a );
2079     if ( aExp == 0x7FFF ) {
2080         if ( aSig0 | aSig1 ) {
2081             return commonNaNToFloat64( float128ToCommonNaN( a ) );
2082         }
2083         return packFloat64( aSign, 0x7FF, 0 );
2084     }
2085     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
2086     aSig0 |= ( aSig1 != 0 );
2087     if ( aExp || aSig0 ) {
2088         aSig0 |= LIT64( 0x4000000000000000 );
2089         aExp -= 0x3C01;
2090     }
2091     return roundAndPackFloat64( aSign, aExp, aSig0 );
2092
2093 }
2094
2095 #ifdef FLOATX80
2096
2097 /*----------------------------------------------------------------------------
2098 | Returns the result of converting the quadruple-precision floating-point
2099 | value `a' to the extended double-precision floating-point format.  The
2100 | conversion is performed according to the IEC/IEEE Standard for Binary
2101 | Floating-Point Arithmetic.
2102 *----------------------------------------------------------------------------*/
2103
2104 floatx80 float128_to_floatx80( float128 a )
2105 {
2106     flag aSign;
2107     int32 aExp;
2108     bits64 aSig0, aSig1;
2109
2110     aSig1 = extractFloat128Frac1( a );
2111     aSig0 = extractFloat128Frac0( a );
2112     aExp = extractFloat128Exp( a );
2113     aSign = extractFloat128Sign( a );
2114     if ( aExp == 0x7FFF ) {
2115         if ( aSig0 | aSig1 ) {
2116             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
2117         }
2118         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2119     }
2120     if ( aExp == 0 ) {
2121         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
2122         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2123     }
2124     else {
2125         aSig0 |= LIT64( 0x0001000000000000 );
2126     }
2127     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
2128     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
2129
2130 }
2131
2132 #endif
2133
2134 /*----------------------------------------------------------------------------
2135 | Rounds the quadruple-precision floating-point value `a' to an integer, and
2136 | returns the result as a quadruple-precision floating-point value.  The
2137 | operation is performed according to the IEC/IEEE Standard for Binary
2138 | Floating-Point Arithmetic.
2139 *----------------------------------------------------------------------------*/
2140
2141 float128 float128_round_to_int( float128 a )
2142 {
2143     flag aSign;
2144     int32 aExp;
2145     bits64 lastBitMask, roundBitsMask;
2146     int8 roundingMode;
2147     float128 z;
2148
2149     aExp = extractFloat128Exp( a );
2150     if ( 0x402F <= aExp ) {
2151         if ( 0x406F <= aExp ) {
2152             if (    ( aExp == 0x7FFF )
2153                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
2154                ) {
2155                 return propagateFloat128NaN( a, a );
2156             }
2157             return a;
2158         }
2159         lastBitMask = 1;
2160         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
2161         roundBitsMask = lastBitMask - 1;
2162         z = a;
2163         roundingMode = float_rounding_mode;
2164         if ( roundingMode == float_round_nearest_even ) {
2165             if ( lastBitMask ) {
2166                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
2167                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
2168             }
2169             else {
2170                 if ( (sbits64) z.low < 0 ) {
2171                     ++z.high;
2172                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
2173                 }
2174             }
2175         }
2176         else if ( roundingMode != float_round_to_zero ) {
2177             if (   extractFloat128Sign( z )
2178                  ^ ( roundingMode == float_round_up ) ) {
2179                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
2180             }
2181         }
2182         z.low &= ~ roundBitsMask;
2183     }
2184     else {
2185         if ( aExp < 0x3FFF ) {
2186             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
2187             float_exception_flags |= float_flag_inexact;
2188             aSign = extractFloat128Sign( a );
2189             switch ( float_rounding_mode ) {
2190              case float_round_nearest_even:
2191                 if (    ( aExp == 0x3FFE )
2192                      && (   extractFloat128Frac0( a )
2193                           | extractFloat128Frac1( a ) )
2194                    ) {
2195                     return packFloat128( aSign, 0x3FFF, 0, 0 );
2196                 }
2197                 break;
2198              case float_round_down:
2199                 return
2200                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
2201                     : packFloat128( 0, 0, 0, 0 );
2202              case float_round_up:
2203                 return
2204                       aSign ? packFloat128( 1, 0, 0, 0 )
2205                     : packFloat128( 0, 0x3FFF, 0, 0 );
2206             }
2207             return packFloat128( aSign, 0, 0, 0 );
2208         }
2209         lastBitMask = 1;
2210         lastBitMask <<= 0x402F - aExp;
2211         roundBitsMask = lastBitMask - 1;
2212         z.low = 0;
2213         z.high = a.high;
2214         roundingMode = float_rounding_mode;
2215         if ( roundingMode == float_round_nearest_even ) {
2216             z.high += lastBitMask>>1;
2217             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
2218                 z.high &= ~ lastBitMask;
2219             }
2220         }
2221         else if ( roundingMode != float_round_to_zero ) {
2222             if (   extractFloat128Sign( z )
2223                  ^ ( roundingMode == float_round_up ) ) {
2224                 z.high |= ( a.low != 0 );
2225                 z.high += roundBitsMask;
2226             }
2227         }
2228         z.high &= ~ roundBitsMask;
2229     }
2230     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
2231         float_exception_flags |= float_flag_inexact;
2232     }
2233     return z;
2234
2235 }
2236
2237 /*----------------------------------------------------------------------------
2238 | Returns the result of adding the absolute values of the quadruple-precision
2239 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2240 | before being returned.  `zSign' is ignored if the result is a NaN.
2241 | The addition is performed according to the IEC/IEEE Standard for Binary
2242 | Floating-Point Arithmetic.
2243 *----------------------------------------------------------------------------*/
2244
2245 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
2246 {
2247     int32 aExp, bExp, zExp;
2248     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
2249     int32 expDiff;
2250
2251     aSig1 = extractFloat128Frac1( a );
2252     aSig0 = extractFloat128Frac0( a );
2253     aExp = extractFloat128Exp( a );
2254     bSig1 = extractFloat128Frac1( b );
2255     bSig0 = extractFloat128Frac0( b );
2256     bExp = extractFloat128Exp( b );
2257     expDiff = aExp - bExp;
2258     if ( 0 < expDiff ) {
2259         if ( aExp == 0x7FFF ) {
2260             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
2261             return a;
2262         }
2263         if ( bExp == 0 ) {
2264             --expDiff;
2265         }
2266         else {
2267             bSig0 |= LIT64( 0x0001000000000000 );
2268         }
2269         shift128ExtraRightJamming(
2270             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
2271         zExp = aExp;
2272     }
2273     else if ( expDiff < 0 ) {
2274         if ( bExp == 0x7FFF ) {
2275             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2276             return packFloat128( zSign, 0x7FFF, 0, 0 );
2277         }
2278         if ( aExp == 0 ) {
2279             ++expDiff;
2280         }
2281         else {
2282             aSig0 |= LIT64( 0x0001000000000000 );
2283         }
2284         shift128ExtraRightJamming(
2285             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
2286         zExp = bExp;
2287     }
2288     else {
2289         if ( aExp == 0x7FFF ) {
2290             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
2291                 return propagateFloat128NaN( a, b );
2292             }
2293             return a;
2294         }
2295         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
2296         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
2297         zSig2 = 0;
2298         zSig0 |= LIT64( 0x0002000000000000 );
2299         zExp = aExp;
2300         goto shiftRight1;
2301     }
2302     aSig0 |= LIT64( 0x0001000000000000 );
2303     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
2304     --zExp;
2305     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
2306     ++zExp;
2307  shiftRight1:
2308     shift128ExtraRightJamming(
2309         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
2310  roundAndPack:
2311     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2312
2313 }
2314
2315 /*----------------------------------------------------------------------------
2316 | Returns the result of subtracting the absolute values of the quadruple-
2317 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2318 | difference is negated before being returned.  `zSign' is ignored if the
2319 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2320 | Standard for Binary Floating-Point Arithmetic.
2321 *----------------------------------------------------------------------------*/
2322
2323 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
2324 {
2325     int32 aExp, bExp, zExp;
2326     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
2327     int32 expDiff;
2328     float128 z;
2329
2330     aSig1 = extractFloat128Frac1( a );
2331     aSig0 = extractFloat128Frac0( a );
2332     aExp = extractFloat128Exp( a );
2333     bSig1 = extractFloat128Frac1( b );
2334     bSig0 = extractFloat128Frac0( b );
2335     bExp = extractFloat128Exp( b );
2336     expDiff = aExp - bExp;
2337     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
2338     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
2339     if ( 0 < expDiff ) goto aExpBigger;
2340     if ( expDiff < 0 ) goto bExpBigger;
2341     if ( aExp == 0x7FFF ) {
2342         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
2343             return propagateFloat128NaN( a, b );
2344         }
2345         float_raise( float_flag_invalid );
2346         z.low = float128_default_nan_low;
2347         z.high = float128_default_nan_high;
2348         return z;
2349     }
2350     if ( aExp == 0 ) {
2351         aExp = 1;
2352         bExp = 1;
2353     }
2354     if ( bSig0 < aSig0 ) goto aBigger;
2355     if ( aSig0 < bSig0 ) goto bBigger;
2356     if ( bSig1 < aSig1 ) goto aBigger;
2357     if ( aSig1 < bSig1 ) goto bBigger;
2358     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
2359  bExpBigger:
2360     if ( bExp == 0x7FFF ) {
2361         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2362         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
2363     }
2364     if ( aExp == 0 ) {
2365         ++expDiff;
2366     }
2367     else {
2368         aSig0 |= LIT64( 0x4000000000000000 );
2369     }
2370     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2371     bSig0 |= LIT64( 0x4000000000000000 );
2372  bBigger:
2373     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
2374     zExp = bExp;
2375     zSign ^= 1;
2376     goto normalizeRoundAndPack;
2377  aExpBigger:
2378     if ( aExp == 0x7FFF ) {
2379         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
2380         return a;
2381     }
2382     if ( bExp == 0 ) {
2383         --expDiff;
2384     }
2385     else {
2386         bSig0 |= LIT64( 0x4000000000000000 );
2387     }
2388     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
2389     aSig0 |= LIT64( 0x4000000000000000 );
2390  aBigger:
2391     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
2392     zExp = aExp;
2393  normalizeRoundAndPack:
2394     --zExp;
2395     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
2396
2397 }
2398
2399 /*----------------------------------------------------------------------------
2400 | Returns the result of adding the quadruple-precision floating-point values
2401 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2402 | for Binary Floating-Point Arithmetic.
2403 *----------------------------------------------------------------------------*/
2404
2405 float128 float128_add( float128 a, float128 b )
2406 {
2407     flag aSign, bSign;
2408
2409     aSign = extractFloat128Sign( a );
2410     bSign = extractFloat128Sign( b );
2411     if ( aSign == bSign ) {
2412         return addFloat128Sigs( a, b, aSign );
2413     }
2414     else {
2415         return subFloat128Sigs( a, b, aSign );
2416     }
2417
2418 }
2419
2420 /*----------------------------------------------------------------------------
2421 | Returns the result of subtracting the quadruple-precision floating-point
2422 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
2423 | Standard for Binary Floating-Point Arithmetic.
2424 *----------------------------------------------------------------------------*/
2425
2426 float128 float128_sub( float128 a, float128 b )
2427 {
2428     flag aSign, bSign;
2429
2430     aSign = extractFloat128Sign( a );
2431     bSign = extractFloat128Sign( b );
2432     if ( aSign == bSign ) {
2433         return subFloat128Sigs( a, b, aSign );
2434     }
2435     else {
2436         return addFloat128Sigs( a, b, aSign );
2437     }
2438
2439 }
2440
2441 /*----------------------------------------------------------------------------
2442 | Returns the result of multiplying the quadruple-precision floating-point
2443 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
2444 | Standard for Binary Floating-Point Arithmetic.
2445 *----------------------------------------------------------------------------*/
2446
2447 float128 float128_mul( float128 a, float128 b )
2448 {
2449     flag aSign, bSign, zSign;
2450     int32 aExp, bExp, zExp;
2451     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
2452     float128 z;
2453
2454     aSig1 = extractFloat128Frac1( a );
2455     aSig0 = extractFloat128Frac0( a );
2456     aExp = extractFloat128Exp( a );
2457     aSign = extractFloat128Sign( a );
2458     bSig1 = extractFloat128Frac1( b );
2459     bSig0 = extractFloat128Frac0( b );
2460     bExp = extractFloat128Exp( b );
2461     bSign = extractFloat128Sign( b );
2462     zSign = aSign ^ bSign;
2463     if ( aExp == 0x7FFF ) {
2464         if (    ( aSig0 | aSig1 )
2465              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
2466             return propagateFloat128NaN( a, b );
2467         }
2468         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
2469         return packFloat128( zSign, 0x7FFF, 0, 0 );
2470     }
2471     if ( bExp == 0x7FFF ) {
2472         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2473         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
2474  invalid:
2475             float_raise( float_flag_invalid );
2476             z.low = float128_default_nan_low;
2477             z.high = float128_default_nan_high;
2478             return z;
2479         }
2480         return packFloat128( zSign, 0x7FFF, 0, 0 );
2481     }
2482     if ( aExp == 0 ) {
2483         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
2484         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2485     }
2486     if ( bExp == 0 ) {
2487         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
2488         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2489     }
2490     zExp = aExp + bExp - 0x4000;
2491     aSig0 |= LIT64( 0x0001000000000000 );
2492     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
2493     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
2494     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
2495     zSig2 |= ( zSig3 != 0 );
2496     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
2497         shift128ExtraRightJamming(
2498             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
2499         ++zExp;
2500     }
2501     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2502
2503 }
2504
2505 /*----------------------------------------------------------------------------
2506 | Returns the result of dividing the quadruple-precision floating-point value
2507 | `a' by the corresponding value `b'.  The operation is performed according to
2508 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2509 *----------------------------------------------------------------------------*/
2510
2511 float128 float128_div( float128 a, float128 b )
2512 {
2513     flag aSign, bSign, zSign;
2514     int32 aExp, bExp, zExp;
2515     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
2516     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2517     float128 z;
2518
2519     aSig1 = extractFloat128Frac1( a );
2520     aSig0 = extractFloat128Frac0( a );
2521     aExp = extractFloat128Exp( a );
2522     aSign = extractFloat128Sign( a );
2523     bSig1 = extractFloat128Frac1( b );
2524     bSig0 = extractFloat128Frac0( b );
2525     bExp = extractFloat128Exp( b );
2526     bSign = extractFloat128Sign( b );
2527     zSign = aSign ^ bSign;
2528     if ( aExp == 0x7FFF ) {
2529         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
2530         if ( bExp == 0x7FFF ) {
2531             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2532             goto invalid;
2533         }
2534         return packFloat128( zSign, 0x7FFF, 0, 0 );
2535     }
2536     if ( bExp == 0x7FFF ) {
2537         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2538         return packFloat128( zSign, 0, 0, 0 );
2539     }
2540     if ( bExp == 0 ) {
2541         if ( ( bSig0 | bSig1 ) == 0 ) {
2542             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
2543  invalid:
2544                 float_raise( float_flag_invalid );
2545                 z.low = float128_default_nan_low;
2546                 z.high = float128_default_nan_high;
2547                 return z;
2548             }
2549             float_raise( float_flag_divbyzero );
2550             return packFloat128( zSign, 0x7FFF, 0, 0 );
2551         }
2552         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2553     }
2554     if ( aExp == 0 ) {
2555         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
2556         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2557     }
2558     zExp = aExp - bExp + 0x3FFD;
2559     shortShift128Left(
2560         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
2561     shortShift128Left(
2562         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
2563     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
2564         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
2565         ++zExp;
2566     }
2567     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
2568     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
2569     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
2570     while ( (sbits64) rem0 < 0 ) {
2571         --zSig0;
2572         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
2573     }
2574     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
2575     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
2576         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
2577         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
2578         while ( (sbits64) rem1 < 0 ) {
2579             --zSig1;
2580             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
2581         }
2582         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2583     }
2584     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
2585     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
2586
2587 }
2588
2589 /*----------------------------------------------------------------------------
2590 | Returns the remainder of the quadruple-precision floating-point value `a'
2591 | with respect to the corresponding value `b'.  The operation is performed
2592 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2593 *----------------------------------------------------------------------------*/
2594
2595 float128 float128_rem( float128 a, float128 b )
2596 {
2597     flag aSign, bSign, zSign;
2598     int32 aExp, bExp, expDiff;
2599     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
2600     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
2601     sbits64 sigMean0;
2602     float128 z;
2603
2604     aSig1 = extractFloat128Frac1( a );
2605     aSig0 = extractFloat128Frac0( a );
2606     aExp = extractFloat128Exp( a );
2607     aSign = extractFloat128Sign( a );
2608     bSig1 = extractFloat128Frac1( b );
2609     bSig0 = extractFloat128Frac0( b );
2610     bExp = extractFloat128Exp( b );
2611     bSign = extractFloat128Sign( b );
2612     if ( aExp == 0x7FFF ) {
2613         if (    ( aSig0 | aSig1 )
2614              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
2615             return propagateFloat128NaN( a, b );
2616         }
2617         goto invalid;
2618     }
2619     if ( bExp == 0x7FFF ) {
2620         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
2621         return a;
2622     }
2623     if ( bExp == 0 ) {
2624         if ( ( bSig0 | bSig1 ) == 0 ) {
2625  invalid:
2626             float_raise( float_flag_invalid );
2627             z.low = float128_default_nan_low;
2628             z.high = float128_default_nan_high;
2629             return z;
2630         }
2631         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
2632     }
2633     if ( aExp == 0 ) {
2634         if ( ( aSig0 | aSig1 ) == 0 ) return a;
2635         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2636     }
2637     expDiff = aExp - bExp;
2638     if ( expDiff < -1 ) return a;
2639     shortShift128Left(
2640         aSig0 | LIT64( 0x0001000000000000 ),
2641         aSig1,
2642         15 - ( expDiff < 0 ),
2643         &aSig0,
2644         &aSig1
2645     );
2646     shortShift128Left(
2647         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
2648     q = le128( bSig0, bSig1, aSig0, aSig1 );
2649     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2650     expDiff -= 64;
2651     while ( 0 < expDiff ) {
2652         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
2653         q = ( 4 < q ) ? q - 4 : 0;
2654         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
2655         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
2656         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
2657         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
2658         expDiff -= 61;
2659     }
2660     if ( -64 < expDiff ) {
2661         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
2662         q = ( 4 < q ) ? q - 4 : 0;
2663         q >>= - expDiff;
2664         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
2665         expDiff += 52;
2666         if ( expDiff < 0 ) {
2667             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
2668         }
2669         else {
2670             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
2671         }
2672         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
2673         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
2674     }
2675     else {
2676         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
2677         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
2678     }
2679     do {
2680         alternateASig0 = aSig0;
2681         alternateASig1 = aSig1;
2682         ++q;
2683         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
2684     } while ( 0 <= (sbits64) aSig0 );
2685     add128(
2686         aSig0, aSig1, alternateASig0, alternateASig1, &sigMean0, &sigMean1 );
2687     if (    ( sigMean0 < 0 )
2688          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
2689         aSig0 = alternateASig0;
2690         aSig1 = alternateASig1;
2691     }
2692     zSign = ( (sbits64) aSig0 < 0 );
2693     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
2694     return
2695         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
2696
2697 }
2698
2699 /*----------------------------------------------------------------------------
2700 | Returns the square root of the quadruple-precision floating-point value `a'.
2701 | The operation is performed according to the IEC/IEEE Standard for Binary
2702 | Floating-Point Arithmetic.
2703 *----------------------------------------------------------------------------*/
2704
2705 float128 float128_sqrt( float128 a )
2706 {
2707     flag aSign;
2708     int32 aExp, zExp;
2709     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
2710     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
2711     float128 z;
2712
2713     aSig1 = extractFloat128Frac1( a );
2714     aSig0 = extractFloat128Frac0( a );
2715     aExp = extractFloat128Exp( a );
2716     aSign = extractFloat128Sign( a );
2717     if ( aExp == 0x7FFF ) {
2718         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
2719         if ( ! aSign ) return a;
2720         goto invalid;
2721     }
2722     if ( aSign ) {
2723         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
2724  invalid:
2725         float_raise( float_flag_invalid );
2726         z.low = float128_default_nan_low;
2727         z.high = float128_default_nan_high;
2728         return z;
2729     }
2730     if ( aExp == 0 ) {
2731         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
2732         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
2733     }
2734     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
2735     aSig0 |= LIT64( 0x0001000000000000 );
2736     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
2737     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
2738     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
2739     doubleZSig0 = zSig0<<1;
2740     mul64To128( zSig0, zSig0, &term0, &term1 );
2741     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
2742     while ( (sbits64) rem0 < 0 ) {
2743         --zSig0;
2744         doubleZSig0 -= 2;
2745         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
2746     }
2747     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
2748     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
2749         if ( zSig1 == 0 ) zSig1 = 1;
2750         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
2751         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
2752         mul64To128( zSig1, zSig1, &term2, &term3 );
2753         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
2754         while ( (sbits64) rem1 < 0 ) {
2755             --zSig1;
2756             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
2757             term3 |= 1;
2758             term2 |= doubleZSig0;
2759             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
2760         }
2761         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
2762     }
2763     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
2764     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
2765
2766 }
2767
2768 /*----------------------------------------------------------------------------
2769 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
2770 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2771 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2772 *----------------------------------------------------------------------------*/
2773
2774 flag float128_eq( float128 a, float128 b )
2775 {
2776
2777     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
2778               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2779          || (    ( extractFloat128Exp( b ) == 0x7FFF )
2780               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2781        ) {
2782         if (    float128_is_signaling_nan( a )
2783              || float128_is_signaling_nan( b ) ) {
2784             float_raise( float_flag_invalid );
2785         }
2786         return 0;
2787     }
2788     return
2789            ( a.low == b.low )
2790         && (    ( a.high == b.high )
2791              || (    ( a.low == 0 )
2792                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
2793            );
2794
2795 }
2796
2797 /*----------------------------------------------------------------------------
2798 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2799 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2800 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2801 | Arithmetic.
2802 *----------------------------------------------------------------------------*/
2803
2804 flag float128_le( float128 a, float128 b )
2805 {
2806     flag aSign, bSign;
2807
2808     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
2809               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2810          || (    ( extractFloat128Exp( b ) == 0x7FFF )
2811               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2812        ) {
2813         float_raise( float_flag_invalid );
2814         return 0;
2815     }
2816     aSign = extractFloat128Sign( a );
2817     bSign = extractFloat128Sign( b );
2818     if ( aSign != bSign ) {
2819         return
2820                aSign
2821             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2822                  == 0 );
2823     }
2824     return
2825           aSign ? le128( b.high, b.low, a.high, a.low )
2826         : le128( a.high, a.low, b.high, b.low );
2827
2828 }
2829
2830 /*----------------------------------------------------------------------------
2831 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2832 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2833 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2834 *----------------------------------------------------------------------------*/
2835
2836 flag float128_lt( float128 a, float128 b )
2837 {
2838     flag aSign, bSign;
2839
2840     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
2841               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2842          || (    ( extractFloat128Exp( b ) == 0x7FFF )
2843               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2844        ) {
2845         float_raise( float_flag_invalid );
2846         return 0;
2847     }
2848     aSign = extractFloat128Sign( a );
2849     bSign = extractFloat128Sign( b );
2850     if ( aSign != bSign ) {
2851         return
2852                aSign
2853             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2854                  != 0 );
2855     }
2856     return
2857           aSign ? lt128( b.high, b.low, a.high, a.low )
2858         : lt128( a.high, a.low, b.high, b.low );
2859
2860 }
2861
2862 /*----------------------------------------------------------------------------
2863 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
2864 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2865 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2866 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2867 *----------------------------------------------------------------------------*/
2868
2869 flag float128_eq_signaling( float128 a, float128 b )
2870 {
2871
2872     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
2873               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2874          || (    ( extractFloat128Exp( b ) == 0x7FFF )
2875               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2876        ) {
2877         float_raise( float_flag_invalid );
2878         return 0;
2879     }
2880     return
2881            ( a.low == b.low )
2882         && (    ( a.high == b.high )
2883              || (    ( a.low == 0 )
2884                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
2885            );
2886
2887 }
2888
2889 /*----------------------------------------------------------------------------
2890 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2891 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2892 | cause an exception.  Otherwise, the comparison is performed according to the
2893 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2894 *----------------------------------------------------------------------------*/
2895
2896 flag float128_le_quiet( float128 a, float128 b )
2897 {
2898     flag aSign, bSign;
2899
2900     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
2901               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2902          || (    ( extractFloat128Exp( b ) == 0x7FFF )
2903               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2904        ) {
2905         if (    float128_is_signaling_nan( a )
2906              || float128_is_signaling_nan( b ) ) {
2907             float_raise( float_flag_invalid );
2908         }
2909         return 0;
2910     }
2911     aSign = extractFloat128Sign( a );
2912     bSign = extractFloat128Sign( b );
2913     if ( aSign != bSign ) {
2914         return
2915                aSign
2916             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2917                  == 0 );
2918     }
2919     return
2920           aSign ? le128( b.high, b.low, a.high, a.low )
2921         : le128( a.high, a.low, b.high, b.low );
2922
2923 }
2924
2925 /*----------------------------------------------------------------------------
2926 | Returns 1 if the quadruple-precision floating-point value `a' is less than
2927 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2928 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2929 | Standard for Binary Floating-Point Arithmetic.
2930 *----------------------------------------------------------------------------*/
2931
2932 flag float128_lt_quiet( float128 a, float128 b )
2933 {
2934     flag aSign, bSign;
2935
2936     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
2937               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
2938          || (    ( extractFloat128Exp( b ) == 0x7FFF )
2939               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
2940        ) {
2941         if (    float128_is_signaling_nan( a )
2942              || float128_is_signaling_nan( b ) ) {
2943             float_raise( float_flag_invalid );
2944         }
2945         return 0;
2946     }
2947     aSign = extractFloat128Sign( a );
2948     bSign = extractFloat128Sign( b );
2949     if ( aSign != bSign ) {
2950         return
2951                aSign
2952             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
2953                  != 0 );
2954     }
2955     return
2956           aSign ? lt128( b.high, b.low, a.high, a.low )
2957         : lt128( a.high, a.low, b.high, b.low );
2958
2959 }
2960
2961 #endif
2962