softfloat/softfloat.c

   1
   2 /*============================================================================
   3
   4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
   5 Package, Release 2b.
   6
   7 Written by John R. Hauser.  This work was made possible in part by the
   8 International Computer Science Institute, located at Suite 600, 1947 Center
   9 Street, Berkeley, California 94704.  Funding was partially provided by the
  10 National Science Foundation under grant MIP-9311980.  The original version
  11 of this code was written as part of a project to build a fixed-point vector
  12 processor in collaboration with the University of California at Berkeley,
  13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
  14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
  15 arithmetic/SoftFloat.html'.
  16
  17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
  18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
  19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
  20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
  21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
  22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
  23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
  24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
  25
  26 Derivative works are acceptable, even for commercial purposes, so long as
  27 (1) the source code for the derivative work includes prominent notice that
  28 the work is derivative, and (2) the source code includes prominent notice with
  29 these four paragraphs for those parts of this code that are retained.
  30
  31 =============================================================================*/
  32
  33 #include "milieu.h"
  34 #include "softfloat.h"
  35
  36 /*----------------------------------------------------------------------------
  37 | Floating-point rounding mode, extended double-precision rounding precision,
  38 | and exception flags.
  39 *----------------------------------------------------------------------------*/
  40 int8 float_rounding_mode = float_round_nearest_even;
  41 int8 float_exception_flags = 0;
  42 #ifdef FLOATX80
  43 int8 floatx80_rounding_precision = 80;
  44 #endif
  45
  46 /*----------------------------------------------------------------------------
  47 | Primitive arithmetic functions, including multi-word arithmetic, and
  48 | division and square root approximations.  (Can be specialized to target if
  49 | desired.)
  50 *----------------------------------------------------------------------------*/
  51 #include "softfloat-macros"
  52
  53 /*----------------------------------------------------------------------------
  54 | Functions and definitions to determine:  (1) whether tininess for underflow
  55 | is detected before or after rounding by default, (2) what (if anything)
  56 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
  57 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
  58 | are propagated from function inputs to output.  These details are target-
  59 | specific.
  60 *----------------------------------------------------------------------------*/
  61 #include "softfloat-specialize"
  62
  63 /*----------------------------------------------------------------------------
  64 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
  65 | and 7, and returns the properly rounded 32-bit integer corresponding to the
  66 | input.  If `zSign' is 1, the input is negated before being converted to an
  67 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
  68 | is simply rounded to an integer, with the inexact exception raised if the
  69 | input cannot be represented exactly as an integer.  However, if the fixed-
  70 | point input is too large, the invalid exception is raised and the largest
  71 | positive or negative integer is returned.
  72 *----------------------------------------------------------------------------*/
  73
  74 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
  75 {
  76     int8 roundingMode;
  77     flag roundNearestEven;
  78     int8 roundIncrement, roundBits;
  79     int32 z;
  80
  81     roundingMode = float_rounding_mode;
  82     roundNearestEven = ( roundingMode == float_round_nearest_even );
  83     roundIncrement = 0x40;
  84     if ( ! roundNearestEven ) {
  85         if ( roundingMode == float_round_to_zero ) {
  86             roundIncrement = 0;
  87         }
  88         else {
  89             roundIncrement = 0x7F;
  90             if ( zSign ) {
  91                 if ( roundingMode == float_round_up ) roundIncrement = 0;
  92             }
  93             else {
  94                 if ( roundingMode == float_round_down ) roundIncrement = 0;
  95             }
  96         }
  97     }
  98     roundBits = absZ & 0x7F;
  99     absZ = ( absZ + roundIncrement )>>7;
 100     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 101     z = absZ;
 102     if ( zSign ) z = - z;
 103     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
 104         float_raise( float_flag_invalid );
 105         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
 106     }
 107     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 108     return z;
 109
 110 }
 111
 112 /*----------------------------------------------------------------------------
 113 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
 114 | `absZ1', with binary point between bits 63 and 64 (between the input words),
 115 | and returns the properly rounded 64-bit integer corresponding to the input.
 116 | If `zSign' is 1, the input is negated before being converted to an integer.
 117 | Ordinarily, the fixed-point input is simply rounded to an integer, with
 118 | the inexact exception raised if the input cannot be represented exactly as
 119 | an integer.  However, if the fixed-point input is too large, the invalid
 120 | exception is raised and the largest positive or negative integer is
 121 | returned.
 122 *----------------------------------------------------------------------------*/
 123
 124 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
 125 {
 126     int8 roundingMode;
 127     flag roundNearestEven, increment;
 128     int64 z;
 129
 130     roundingMode = float_rounding_mode;
 131     roundNearestEven = ( roundingMode == float_round_nearest_even );
 132     increment = ( (sbits64) absZ1 < 0 );
 133     if ( ! roundNearestEven ) {
 134         if ( roundingMode == float_round_to_zero ) {
 135             increment = 0;
 136         }
 137         else {
 138             if ( zSign ) {
 139                 increment = ( roundingMode == float_round_down ) && absZ1;
 140             }
 141             else {
 142                 increment = ( roundingMode == float_round_up ) && absZ1;
 143             }
 144         }
 145     }
 146     if ( increment ) {
 147         ++absZ0;
 148         if ( absZ0 == 0 ) goto overflow;
 149         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
 150     }
 151     z = absZ0;
 152     if ( zSign ) z = - z;
 153     if ( z && ( ( z < 0 ) ^ zSign ) ) {
 154  overflow:
 155         float_raise( float_flag_invalid );
 156         return
 157               zSign ? (sbits64) LIT64( 0x8000000000000000 )
 158             : LIT64( 0x7FFFFFFFFFFFFFFF );
 159     }
 160     if ( absZ1 ) float_exception_flags |= float_flag_inexact;
 161     return z;
 162
 163 }
 164
 165 /*----------------------------------------------------------------------------
 166 | Returns the fraction bits of the single-precision floating-point value `a'.
 167 *----------------------------------------------------------------------------*/
 168
 169 INLINE bits32 extractFloat32Frac( float32 a )
 170 {
 171
 172     return a & 0x007FFFFF;
 173
 174 }
 175
 176 /*----------------------------------------------------------------------------
 177 | Returns the exponent bits of the single-precision floating-point value `a'.
 178 *----------------------------------------------------------------------------*/
 179
 180 INLINE int16 extractFloat32Exp( float32 a )
 181 {
 182
 183     return ( a>>23 ) & 0xFF;
 184
 185 }
 186
 187 /*----------------------------------------------------------------------------
 188 | Returns the sign bit of the single-precision floating-point value `a'.
 189 *----------------------------------------------------------------------------*/
 190
 191 INLINE flag extractFloat32Sign( float32 a )
 192 {
 193
 194     return a>>31;
 195
 196 }
 197
 198 /*----------------------------------------------------------------------------
 199 | Normalizes the subnormal single-precision floating-point value represented
 200 | by the denormalized significand `aSig'.  The normalized exponent and
 201 | significand are stored at the locations pointed to by `zExpPtr' and
 202 | `zSigPtr', respectively.
 203 *----------------------------------------------------------------------------*/
 204
 205 static void
 206  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
 207 {
 208     int8 shiftCount;
 209
 210     shiftCount = countLeadingZeros32( aSig ) - 8;
 211     *zSigPtr = aSig<<shiftCount;
 212     *zExpPtr = 1 - shiftCount;
 213
 214 }
 215
 216 /*----------------------------------------------------------------------------
 217 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 218 | single-precision floating-point value, returning the result.  After being
 219 | shifted into the proper positions, the three fields are simply added
 220 | together to form the result.  This means that any integer portion of `zSig'
 221 | will be added into the exponent.  Since a properly normalized significand
 222 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 223 | than the desired result exponent whenever `zSig' is a complete, normalized
 224 | significand.
 225 *----------------------------------------------------------------------------*/
 226
 227 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
 228 {
 229
 230     return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
 231
 232 }
 233
 234 /*----------------------------------------------------------------------------
 235 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 236 | and significand `zSig', and returns the proper single-precision floating-
 237 | point value corresponding to the abstract input.  Ordinarily, the abstract
 238 | value is simply rounded and packed into the single-precision format, with
 239 | the inexact exception raised if the abstract input cannot be represented
 240 | exactly.  However, if the abstract value is too large, the overflow and
 241 | inexact exceptions are raised and an infinity or maximal finite value is
 242 | returned.  If the abstract value is too small, the input value is rounded to
 243 | a subnormal number, and the underflow and inexact exceptions are raised if
 244 | the abstract input cannot be represented exactly as a subnormal single-
 245 | precision floating-point number.
 246 |     The input significand `zSig' has its binary point between bits 30
 247 | and 29, which is 7 bits to the left of the usual location.  This shifted
 248 | significand must be normalized or smaller.  If `zSig' is not normalized,
 249 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 250 | and it must not require rounding.  In the usual case that `zSig' is
 251 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 252 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 253 | Binary Floating-Point Arithmetic.
 254 *----------------------------------------------------------------------------*/
 255
 256 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 257 {
 258     int8 roundingMode;
 259     flag roundNearestEven;
 260     int8 roundIncrement, roundBits;
 261     flag isTiny;
 262
 263     roundingMode = float_rounding_mode;
 264     roundNearestEven = ( roundingMode == float_round_nearest_even );
 265     roundIncrement = 0x40;
 266     if ( ! roundNearestEven ) {
 267         if ( roundingMode == float_round_to_zero ) {
 268             roundIncrement = 0;
 269         }
 270         else {
 271             roundIncrement = 0x7F;
 272             if ( zSign ) {
 273                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 274             }
 275             else {
 276                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 277             }
 278         }
 279     }
 280     roundBits = zSig & 0x7F;
 281     if ( 0xFD <= (bits16) zExp ) {
 282         if (    ( 0xFD < zExp )
 283              || (    ( zExp == 0xFD )
 284                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
 285            ) {
 286             float_raise( float_flag_overflow | float_flag_inexact );
 287             return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
 288         }
 289         if ( zExp < 0 ) {
 290             isTiny =
 291                    ( float_detect_tininess == float_tininess_before_rounding )
 292                 || ( zExp < -1 )
 293                 || ( zSig + roundIncrement < 0x80000000 );
 294             shift32RightJamming( zSig, - zExp, &zSig );
 295             zExp = 0;
 296             roundBits = zSig & 0x7F;
 297             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 298         }
 299     }
 300     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 301     zSig = ( zSig + roundIncrement )>>7;
 302     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
 303     if ( zSig == 0 ) zExp = 0;
 304     return packFloat32( zSign, zExp, zSig );
 305
 306 }
 307
 308 /*----------------------------------------------------------------------------
 309 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 310 | and significand `zSig', and returns the proper single-precision floating-
 311 | point value corresponding to the abstract input.  This routine is just like
 312 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
 313 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 314 | floating-point exponent.
 315 *----------------------------------------------------------------------------*/
 316
 317 static float32
 318  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
 319 {
 320     int8 shiftCount;
 321
 322     shiftCount = countLeadingZeros32( zSig ) - 1;
 323     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
 324
 325 }
 326
 327 /*----------------------------------------------------------------------------
 328 | Returns the fraction bits of the double-precision floating-point value `a'.
 329 *----------------------------------------------------------------------------*/
 330
 331 INLINE bits64 extractFloat64Frac( float64 a )
 332 {
 333
 334     return a & LIT64( 0x000FFFFFFFFFFFFF );
 335
 336 }
 337
 338 /*----------------------------------------------------------------------------
 339 | Returns the exponent bits of the double-precision floating-point value `a'.
 340 *----------------------------------------------------------------------------*/
 341
 342 INLINE int16 extractFloat64Exp( float64 a )
 343 {
 344
 345     return ( a>>52 ) & 0x7FF;
 346
 347 }
 348
 349 /*----------------------------------------------------------------------------
 350 | Returns the sign bit of the double-precision floating-point value `a'.
 351 *----------------------------------------------------------------------------*/
 352
 353 INLINE flag extractFloat64Sign( float64 a )
 354 {
 355
 356     return a>>63;
 357
 358 }
 359
 360 /*----------------------------------------------------------------------------
 361 | Normalizes the subnormal double-precision floating-point value represented
 362 | by the denormalized significand `aSig'.  The normalized exponent and
 363 | significand are stored at the locations pointed to by `zExpPtr' and
 364 | `zSigPtr', respectively.
 365 *----------------------------------------------------------------------------*/
 366
 367 static void
 368  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
 369 {
 370     int8 shiftCount;
 371
 372     shiftCount = countLeadingZeros64( aSig ) - 11;
 373     *zSigPtr = aSig<<shiftCount;
 374     *zExpPtr = 1 - shiftCount;
 375
 376 }
 377
 378 /*----------------------------------------------------------------------------
 379 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
 380 | double-precision floating-point value, returning the result.  After being
 381 | shifted into the proper positions, the three fields are simply added
 382 | together to form the result.  This means that any integer portion of `zSig'
 383 | will be added into the exponent.  Since a properly normalized significand
 384 | will have an integer portion equal to 1, the `zExp' input should be 1 less
 385 | than the desired result exponent whenever `zSig' is a complete, normalized
 386 | significand.
 387 *----------------------------------------------------------------------------*/
 388
 389 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
 390 {
 391
 392     return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
 393
 394 }
 395
 396 /*----------------------------------------------------------------------------
 397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 398 | and significand `zSig', and returns the proper double-precision floating-
 399 | point value corresponding to the abstract input.  Ordinarily, the abstract
 400 | value is simply rounded and packed into the double-precision format, with
 401 | the inexact exception raised if the abstract input cannot be represented
 402 | exactly.  However, if the abstract value is too large, the overflow and
 403 | inexact exceptions are raised and an infinity or maximal finite value is
 404 | returned.  If the abstract value is too small, the input value is rounded
 405 | to a subnormal number, and the underflow and inexact exceptions are raised
 406 | if the abstract input cannot be represented exactly as a subnormal double-
 407 | precision floating-point number.
 408 |     The input significand `zSig' has its binary point between bits 62
 409 | and 61, which is 10 bits to the left of the usual location.  This shifted
 410 | significand must be normalized or smaller.  If `zSig' is not normalized,
 411 | `zExp' must be 0; in that case, the result returned is a subnormal number,
 412 | and it must not require rounding.  In the usual case that `zSig' is
 413 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
 414 | The handling of underflow and overflow follows the IEC/IEEE Standard for
 415 | Binary Floating-Point Arithmetic.
 416 *----------------------------------------------------------------------------*/
 417
 418 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 419 {
 420     int8 roundingMode;
 421     flag roundNearestEven;
 422     int16 roundIncrement, roundBits;
 423     flag isTiny;
 424
 425     roundingMode = float_rounding_mode;
 426     roundNearestEven = ( roundingMode == float_round_nearest_even );
 427     roundIncrement = 0x200;
 428     if ( ! roundNearestEven ) {
 429         if ( roundingMode == float_round_to_zero ) {
 430             roundIncrement = 0;
 431         }
 432         else {
 433             roundIncrement = 0x3FF;
 434             if ( zSign ) {
 435                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 436             }
 437             else {
 438                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 439             }
 440         }
 441     }
 442     roundBits = zSig & 0x3FF;
 443     if ( 0x7FD <= (bits16) zExp ) {
 444         if (    ( 0x7FD < zExp )
 445              || (    ( zExp == 0x7FD )
 446                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
 447            ) {
 448             float_raise( float_flag_overflow | float_flag_inexact );
 449             return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
 450         }
 451         if ( zExp < 0 ) {
 452             isTiny =
 453                    ( float_detect_tininess == float_tininess_before_rounding )
 454                 || ( zExp < -1 )
 455                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
 456             shift64RightJamming( zSig, - zExp, &zSig );
 457             zExp = 0;
 458             roundBits = zSig & 0x3FF;
 459             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 460         }
 461     }
 462     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 463     zSig = ( zSig + roundIncrement )>>10;
 464     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
 465     if ( zSig == 0 ) zExp = 0;
 466     return packFloat64( zSign, zExp, zSig );
 467
 468 }
 469
 470 /*----------------------------------------------------------------------------
 471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 472 | and significand `zSig', and returns the proper double-precision floating-
 473 | point value corresponding to the abstract input.  This routine is just like
 474 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
 475 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
 476 | floating-point exponent.
 477 *----------------------------------------------------------------------------*/
 478
 479 static float64
 480  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
 481 {
 482     int8 shiftCount;
 483
 484     shiftCount = countLeadingZeros64( zSig ) - 1;
 485     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
 486
 487 }
 488
 489 #ifdef FLOATX80
 490
 491 /*----------------------------------------------------------------------------
 492 | Returns the fraction bits of the extended double-precision floating-point
 493 | value `a'.
 494 *----------------------------------------------------------------------------*/
 495
 496 INLINE bits64 extractFloatx80Frac( floatx80 a )
 497 {
 498
 499     return a.low;
 500
 501 }
 502
 503 /*----------------------------------------------------------------------------
 504 | Returns the exponent bits of the extended double-precision floating-point
 505 | value `a'.
 506 *----------------------------------------------------------------------------*/
 507
 508 INLINE int32 extractFloatx80Exp( floatx80 a )
 509 {
 510
 511     return a.high & 0x7FFF;
 512
 513 }
 514
 515 /*----------------------------------------------------------------------------
 516 | Returns the sign bit of the extended double-precision floating-point value
 517 | `a'.
 518 *----------------------------------------------------------------------------*/
 519
 520 INLINE flag extractFloatx80Sign( floatx80 a )
 521 {
 522
 523     return a.high>>15;
 524
 525 }
 526
 527 /*----------------------------------------------------------------------------
 528 | Normalizes the subnormal extended double-precision floating-point value
 529 | represented by the denormalized significand `aSig'.  The normalized exponent
 530 | and significand are stored at the locations pointed to by `zExpPtr' and
 531 | `zSigPtr', respectively.
 532 *----------------------------------------------------------------------------*/
 533
 534 static void
 535  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
 536 {
 537     int8 shiftCount;
 538
 539     shiftCount = countLeadingZeros64( aSig );
 540     *zSigPtr = aSig<<shiftCount;
 541     *zExpPtr = 1 - shiftCount;
 542
 543 }
 544
 545 /*----------------------------------------------------------------------------
 546 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
 547 | extended double-precision floating-point value, returning the result.
 548 *----------------------------------------------------------------------------*/
 549
 550 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
 551 {
 552     floatx80 z;
 553
 554     z.low = zSig;
 555     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
 556     return z;
 557
 558 }
 559
 560 /*----------------------------------------------------------------------------
 561 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 562 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
 563 | and returns the proper extended double-precision floating-point value
 564 | corresponding to the abstract input.  Ordinarily, the abstract value is
 565 | rounded and packed into the extended double-precision format, with the
 566 | inexact exception raised if the abstract input cannot be represented
 567 | exactly.  However, if the abstract value is too large, the overflow and
 568 | inexact exceptions are raised and an infinity or maximal finite value is
 569 | returned.  If the abstract value is too small, the input value is rounded to
 570 | a subnormal number, and the underflow and inexact exceptions are raised if
 571 | the abstract input cannot be represented exactly as a subnormal extended
 572 | double-precision floating-point number.
 573 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
 574 | number of bits as single or double precision, respectively.  Otherwise, the
 575 | result is rounded to the full precision of the extended double-precision
 576 | format.
 577 |     The input significand must be normalized or smaller.  If the input
 578 | significand is not normalized, `zExp' must be 0; in that case, the result
 579 | returned is a subnormal number, and it must not require rounding.  The
 580 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
 581 | Floating-Point Arithmetic.
 582 *----------------------------------------------------------------------------*/
 583
 584 static floatx80
 585  roundAndPackFloatx80(
 586      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 587  )
 588 {
 589     int8 roundingMode;
 590     flag roundNearestEven, increment, isTiny;
 591     int64 roundIncrement, roundMask, roundBits;
 592
 593     roundingMode = float_rounding_mode;
 594     roundNearestEven = ( roundingMode == float_round_nearest_even );
 595     if ( roundingPrecision == 80 ) goto precision80;
 596     if ( roundingPrecision == 64 ) {
 597         roundIncrement = LIT64( 0x0000000000000400 );
 598         roundMask = LIT64( 0x00000000000007FF );
 599     }
 600     else if ( roundingPrecision == 32 ) {
 601         roundIncrement = LIT64( 0x0000008000000000 );
 602         roundMask = LIT64( 0x000000FFFFFFFFFF );
 603     }
 604     else {
 605         goto precision80;
 606     }
 607     zSig0 |= ( zSig1 != 0 );
 608     if ( ! roundNearestEven ) {
 609         if ( roundingMode == float_round_to_zero ) {
 610             roundIncrement = 0;
 611         }
 612         else {
 613             roundIncrement = roundMask;
 614             if ( zSign ) {
 615                 if ( roundingMode == float_round_up ) roundIncrement = 0;
 616             }
 617             else {
 618                 if ( roundingMode == float_round_down ) roundIncrement = 0;
 619             }
 620         }
 621     }
 622     roundBits = zSig0 & roundMask;
 623     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 624         if (    ( 0x7FFE < zExp )
 625              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
 626            ) {
 627             goto overflow;
 628         }
 629         if ( zExp <= 0 ) {
 630             isTiny =
 631                    ( float_detect_tininess == float_tininess_before_rounding )
 632                 || ( zExp < 0 )
 633                 || ( zSig0 <= zSig0 + roundIncrement );
 634             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
 635             zExp = 0;
 636             roundBits = zSig0 & roundMask;
 637             if ( isTiny && roundBits ) float_raise( float_flag_underflow );
 638             if ( roundBits ) float_exception_flags |= float_flag_inexact;
 639             zSig0 += roundIncrement;
 640             if ( (sbits64) zSig0 < 0 ) zExp = 1;
 641             roundIncrement = roundMask + 1;
 642             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 643                 roundMask |= roundIncrement;
 644             }
 645             zSig0 &= ~ roundMask;
 646             return packFloatx80( zSign, zExp, zSig0 );
 647         }
 648     }
 649     if ( roundBits ) float_exception_flags |= float_flag_inexact;
 650     zSig0 += roundIncrement;
 651     if ( zSig0 < roundIncrement ) {
 652         ++zExp;
 653         zSig0 = LIT64( 0x8000000000000000 );
 654     }
 655     roundIncrement = roundMask + 1;
 656     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
 657         roundMask |= roundIncrement;
 658     }
 659     zSig0 &= ~ roundMask;
 660     if ( zSig0 == 0 ) zExp = 0;
 661     return packFloatx80( zSign, zExp, zSig0 );
 662  precision80:
 663     increment = ( (sbits64) zSig1 < 0 );
 664     if ( ! roundNearestEven ) {
 665         if ( roundingMode == float_round_to_zero ) {
 666             increment = 0;
 667         }
 668         else {
 669             if ( zSign ) {
 670                 increment = ( roundingMode == float_round_down ) && zSig1;
 671             }
 672             else {
 673                 increment = ( roundingMode == float_round_up ) && zSig1;
 674             }
 675         }
 676     }
 677     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
 678         if (    ( 0x7FFE < zExp )
 679              || (    ( zExp == 0x7FFE )
 680                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
 681                   && increment
 682                 )
 683            ) {
 684             roundMask = 0;
 685  overflow:
 686             float_raise( float_flag_overflow | float_flag_inexact );
 687             if (    ( roundingMode == float_round_to_zero )
 688                  || ( zSign && ( roundingMode == float_round_up ) )
 689                  || ( ! zSign && ( roundingMode == float_round_down ) )
 690                ) {
 691                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
 692             }
 693             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
 694         }
 695         if ( zExp <= 0 ) {
 696             isTiny =
 697                    ( float_detect_tininess == float_tininess_before_rounding )
 698                 || ( zExp < 0 )
 699                 || ! increment
 700                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
 701             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
 702             zExp = 0;
 703             if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
 704             if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 705             if ( roundNearestEven ) {
 706                 increment = ( (sbits64) zSig1 < 0 );
 707             }
 708             else {
 709                 if ( zSign ) {
 710                     increment = ( roundingMode == float_round_down ) && zSig1;
 711                 }
 712                 else {
 713                     increment = ( roundingMode == float_round_up ) && zSig1;
 714                 }
 715             }
 716             if ( increment ) {
 717                 ++zSig0;
 718                 zSig0 &=
 719                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 720                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
 721             }
 722             return packFloatx80( zSign, zExp, zSig0 );
 723         }
 724     }
 725     if ( zSig1 ) float_exception_flags |= float_flag_inexact;
 726     if ( increment ) {
 727         ++zSig0;
 728         if ( zSig0 == 0 ) {
 729             ++zExp;
 730             zSig0 = LIT64( 0x8000000000000000 );
 731         }
 732         else {
 733             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
 734         }
 735     }
 736     else {
 737         if ( zSig0 == 0 ) zExp = 0;
 738     }
 739     return packFloatx80( zSign, zExp, zSig0 );
 740
 741 }
 742
 743 /*----------------------------------------------------------------------------
 744 | Takes an abstract floating-point value having sign `zSign', exponent
 745 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
 746 | and returns the proper extended double-precision floating-point value
 747 | corresponding to the abstract input.  This routine is just like
 748 | `roundAndPackFloatx80' except that the input significand does not have to be
 749 | normalized.
 750 *----------------------------------------------------------------------------*/
 751
 752 static floatx80
 753  normalizeRoundAndPackFloatx80(
 754      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
 755  )
 756 {
 757     int8 shiftCount;
 758
 759     if ( zSig0 == 0 ) {
 760         zSig0 = zSig1;
 761         zSig1 = 0;
 762         zExp -= 64;
 763     }
 764     shiftCount = countLeadingZeros64( zSig0 );
 765     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
 766     zExp -= shiftCount;
 767     return
 768         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
 769
 770 }
 771
 772 #endif
 773
 774 #ifdef FLOAT128
 775
 776 /*----------------------------------------------------------------------------
 777 | Returns the least-significant 64 fraction bits of the quadruple-precision
 778 | floating-point value `a'.
 779 *----------------------------------------------------------------------------*/
 780
 781 INLINE bits64 extractFloat128Frac1( float128 a )
 782 {
 783
 784     return a.low;
 785
 786 }
 787
 788 /*----------------------------------------------------------------------------
 789 | Returns the most-significant 48 fraction bits of the quadruple-precision
 790 | floating-point value `a'.
 791 *----------------------------------------------------------------------------*/
 792
 793 INLINE bits64 extractFloat128Frac0( float128 a )
 794 {
 795
 796     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
 797
 798 }
 799
 800 /*----------------------------------------------------------------------------
 801 | Returns the exponent bits of the quadruple-precision floating-point value
 802 | `a'.
 803 *----------------------------------------------------------------------------*/
 804
 805 INLINE int32 extractFloat128Exp( float128 a )
 806 {
 807
 808     return ( a.high>>48 ) & 0x7FFF;
 809
 810 }
 811
 812 /*----------------------------------------------------------------------------
 813 | Returns the sign bit of the quadruple-precision floating-point value `a'.
 814 *----------------------------------------------------------------------------*/
 815
 816 INLINE flag extractFloat128Sign( float128 a )
 817 {
 818
 819     return a.high>>63;
 820
 821 }
 822
 823 /*----------------------------------------------------------------------------
 824 | Normalizes the subnormal quadruple-precision floating-point value
 825 | represented by the denormalized significand formed by the concatenation of
 826 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
 827 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
 828 | significand are stored at the location pointed to by `zSig0Ptr', and the
 829 | least significant 64 bits of the normalized significand are stored at the
 830 | location pointed to by `zSig1Ptr'.
 831 *----------------------------------------------------------------------------*/
 832
 833 static void
 834  normalizeFloat128Subnormal(
 835      bits64 aSig0,
 836      bits64 aSig1,
 837      int32 *zExpPtr,
 838      bits64 *zSig0Ptr,
 839      bits64 *zSig1Ptr
 840  )
 841 {
 842     int8 shiftCount;
 843
 844     if ( aSig0 == 0 ) {
 845         shiftCount = countLeadingZeros64( aSig1 ) - 15;
 846         if ( shiftCount < 0 ) {
 847             *zSig0Ptr = aSig1>>( - shiftCount );
 848             *zSig1Ptr = aSig1<<( shiftCount & 63 );
 849         }
 850         else {
 851             *zSig0Ptr = aSig1<<shiftCount;
 852             *zSig1Ptr = 0;
 853         }
 854         *zExpPtr = - shiftCount - 63;
 855     }
 856     else {
 857         shiftCount = countLeadingZeros64( aSig0 ) - 15;
 858         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
 859         *zExpPtr = 1 - shiftCount;
 860     }
 861
 862 }
 863
 864 /*----------------------------------------------------------------------------
 865 | Packs the sign `zSign', the exponent `zExp', and the significand formed
 866 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
 867 | floating-point value, returning the result.  After being shifted into the
 868 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
 869 | added together to form the most significant 32 bits of the result.  This
 870 | means that any integer portion of `zSig0' will be added into the exponent.
 871 | Since a properly normalized significand will have an integer portion equal
 872 | to 1, the `zExp' input should be 1 less than the desired result exponent
 873 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
 874 | significand.
 875 *----------------------------------------------------------------------------*/
 876
 877 INLINE float128
 878  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
 879 {
 880     float128 z;
 881
 882     z.low = zSig1;
 883     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
 884     return z;
 885
 886 }
 887
 888 /*----------------------------------------------------------------------------
 889 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
 890 | and extended significand formed by the concatenation of `zSig0', `zSig1',
 891 | and `zSig2', and returns the proper quadruple-precision floating-point value
 892 | corresponding to the abstract input.  Ordinarily, the abstract value is
 893 | simply rounded and packed into the quadruple-precision format, with the
 894 | inexact exception raised if the abstract input cannot be represented
 895 | exactly.  However, if the abstract value is too large, the overflow and
 896 | inexact exceptions are raised and an infinity or maximal finite value is
 897 | returned.  If the abstract value is too small, the input value is rounded to
 898 | a subnormal number, and the underflow and inexact exceptions are raised if
 899 | the abstract input cannot be represented exactly as a subnormal quadruple-
 900 | precision floating-point number.
 901 |     The input significand must be normalized or smaller.  If the input
 902 | significand is not normalized, `zExp' must be 0; in that case, the result
 903 | returned is a subnormal number, and it must not require rounding.  In the
 904 | usual case that the input significand is normalized, `zExp' must be 1 less
 905 | than the ``true'' floating-point exponent.  The handling of underflow and
 906 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
 907 *----------------------------------------------------------------------------*/
 908
 909 static float128
 910  roundAndPackFloat128(
 911      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
 912 {
 913     int8 roundingMode;
 914     flag roundNearestEven, increment, isTiny;
 915
 916     roundingMode = float_rounding_mode;
 917     roundNearestEven = ( roundingMode == float_round_nearest_even );
 918     increment = ( (sbits64) zSig2 < 0 );
 919     if ( ! roundNearestEven ) {
 920         if ( roundingMode == float_round_to_zero ) {
 921             increment = 0;
 922         }
 923         else {
 924             if ( zSign ) {
 925                 increment = ( roundingMode == float_round_down ) && zSig2;
 926             }
 927             else {
 928                 increment = ( roundingMode == float_round_up ) && zSig2;
 929             }
 930         }
 931     }
 932     if ( 0x7FFD <= (bits32) zExp ) {
 933         if (    ( 0x7FFD < zExp )
 934              || (    ( zExp == 0x7FFD )
 935                   && eq128(
 936                          LIT64( 0x0001FFFFFFFFFFFF ),
 937                          LIT64( 0xFFFFFFFFFFFFFFFF ),
 938                          zSig0,
 939                          zSig1
 940                      )
 941                   && increment
 942                 )
 943            ) {
 944             float_raise( float_flag_overflow | float_flag_inexact );
 945             if (    ( roundingMode == float_round_to_zero )
 946                  || ( zSign && ( roundingMode == float_round_up ) )
 947                  || ( ! zSign && ( roundingMode == float_round_down ) )
 948                ) {
 949                 return
 950                     packFloat128(
 951                         zSign,
 952                         0x7FFE,
 953                         LIT64( 0x0000FFFFFFFFFFFF ),
 954                         LIT64( 0xFFFFFFFFFFFFFFFF )
 955                     );
 956             }
 957             return packFloat128( zSign, 0x7FFF, 0, 0 );
 958         }
 959         if ( zExp < 0 ) {
 960             isTiny =
 961                    ( float_detect_tininess == float_tininess_before_rounding )
 962                 || ( zExp < -1 )
 963                 || ! increment
 964                 || lt128(
 965                        zSig0,
 966                        zSig1,
 967                        LIT64( 0x0001FFFFFFFFFFFF ),
 968                        LIT64( 0xFFFFFFFFFFFFFFFF )
 969                    );
 970             shift128ExtraRightJamming(
 971                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
 972             zExp = 0;
 973             if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
 974             if ( roundNearestEven ) {
 975                 increment = ( (sbits64) zSig2 < 0 );
 976             }
 977             else {
 978                 if ( zSign ) {
 979                     increment = ( roundingMode == float_round_down ) && zSig2;
 980                 }
 981                 else {
 982                     increment = ( roundingMode == float_round_up ) && zSig2;
 983                 }
 984             }
 985         }
 986     }
 987     if ( zSig2 ) float_exception_flags |= float_flag_inexact;
 988     if ( increment ) {
 989         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
 990         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
 991     }
 992     else {
 993         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
 994     }
 995     return packFloat128( zSign, zExp, zSig0, zSig1 );
 996
 997 }
 998
 999 /*----------------------------------------------------------------------------
1000 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1001 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1002 | returns the proper quadruple-precision floating-point value corresponding
1003 | to the abstract input.  This routine is just like `roundAndPackFloat128'
1004 | except that the input significand has fewer bits and does not have to be
1005 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
1006 | point exponent.
1007 *----------------------------------------------------------------------------*/
1008
1009 static float128
1010  normalizeRoundAndPackFloat128(
1011      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1012 {
1013     int8 shiftCount;
1014     bits64 zSig2;
1015
1016     if ( zSig0 == 0 ) {
1017         zSig0 = zSig1;
1018         zSig1 = 0;
1019         zExp -= 64;
1020     }
1021     shiftCount = countLeadingZeros64( zSig0 ) - 15;
1022     if ( 0 <= shiftCount ) {
1023         zSig2 = 0;
1024         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1025     }
1026     else {
1027         shift128ExtraRightJamming(
1028             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1029     }
1030     zExp -= shiftCount;
1031     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1032
1033 }
1034
1035 #endif
1036
1037 /*----------------------------------------------------------------------------
1038 | Returns the result of converting the 32-bit two's complement integer `a'
1039 | to the single-precision floating-point format.  The conversion is performed
1040 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1042
1043 float32 int32_to_float32( int32 a )
1044 {
1045     flag zSign;
1046
1047     if ( a == 0 ) return 0;
1048     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1049     zSign = ( a < 0 );
1050     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1051
1052 }
1053
1054 /*----------------------------------------------------------------------------
1055 | Returns the result of converting the 32-bit two's complement integer `a'
1056 | to the double-precision floating-point format.  The conversion is performed
1057 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1058 *----------------------------------------------------------------------------*/
1059
1060 float64 int32_to_float64( int32 a )
1061 {
1062     flag zSign;
1063     uint32 absA;
1064     int8 shiftCount;
1065     bits64 zSig;
1066
1067     if ( a == 0 ) return 0;
1068     zSign = ( a < 0 );
1069     absA = zSign ? - a : a;
1070     shiftCount = countLeadingZeros32( absA ) + 21;
1071     zSig = absA;
1072     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1073
1074 }
1075
1076 #ifdef FLOATX80
1077
1078 /*----------------------------------------------------------------------------
1079 | Returns the result of converting the 32-bit two's complement integer `a'
1080 | to the extended double-precision floating-point format.  The conversion
1081 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1082 | Arithmetic.
1083 *----------------------------------------------------------------------------*/
1084
1085 floatx80 int32_to_floatx80( int32 a )
1086 {
1087     flag zSign;
1088     uint32 absA;
1089     int8 shiftCount;
1090     bits64 zSig;
1091
1092     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1093     zSign = ( a < 0 );
1094     absA = zSign ? - a : a;
1095     shiftCount = countLeadingZeros32( absA ) + 32;
1096     zSig = absA;
1097     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1098
1099 }
1100
1101 #endif
1102
1103 #ifdef FLOAT128
1104
1105 /*----------------------------------------------------------------------------
1106 | Returns the result of converting the 32-bit two's complement integer `a' to
1107 | the quadruple-precision floating-point format.  The conversion is performed
1108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1109 *----------------------------------------------------------------------------*/
1110
1111 float128 int32_to_float128( int32 a )
1112 {
1113     flag zSign;
1114     uint32 absA;
1115     int8 shiftCount;
1116     bits64 zSig0;
1117
1118     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1119     zSign = ( a < 0 );
1120     absA = zSign ? - a : a;
1121     shiftCount = countLeadingZeros32( absA ) + 17;
1122     zSig0 = absA;
1123     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1124
1125 }
1126
1127 #endif
1128
1129 /*----------------------------------------------------------------------------
1130 | Returns the result of converting the 64-bit two's complement integer `a'
1131 | to the single-precision floating-point format.  The conversion is performed
1132 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1133 *----------------------------------------------------------------------------*/
1134
1135 float32 int64_to_float32( int64 a )
1136 {
1137     flag zSign;
1138     uint64 absA;
1139     int8 shiftCount;
1140     bits32 zSig;
1141
1142     if ( a == 0 ) return 0;
1143     zSign = ( a < 0 );
1144     absA = zSign ? - a : a;
1145     shiftCount = countLeadingZeros64( absA ) - 40;
1146     if ( 0 <= shiftCount ) {
1147         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1148     }
1149     else {
1150         shiftCount += 7;
1151         if ( shiftCount < 0 ) {
1152             shift64RightJamming( absA, - shiftCount, &absA );
1153         }
1154         else {
1155             absA <<= shiftCount;
1156         }
1157         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1158     }
1159
1160 }
1161
1162 /*----------------------------------------------------------------------------
1163 | Returns the result of converting the 64-bit two's complement integer `a'
1164 | to the double-precision floating-point format.  The conversion is performed
1165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1166 *----------------------------------------------------------------------------*/
1167
1168 float64 int64_to_float64( int64 a )
1169 {
1170     flag zSign;
1171
1172     if ( a == 0 ) return 0;
1173     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1174         return packFloat64( 1, 0x43E, 0 );
1175     }
1176     zSign = ( a < 0 );
1177     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1178
1179 }
1180
1181 #ifdef FLOATX80
1182
1183 /*----------------------------------------------------------------------------
1184 | Returns the result of converting the 64-bit two's complement integer `a'
1185 | to the extended double-precision floating-point format.  The conversion
1186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187 | Arithmetic.
1188 *----------------------------------------------------------------------------*/
1189
1190 floatx80 int64_to_floatx80( int64 a )
1191 {
1192     flag zSign;
1193     uint64 absA;
1194     int8 shiftCount;
1195
1196     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1197     zSign = ( a < 0 );
1198     absA = zSign ? - a : a;
1199     shiftCount = countLeadingZeros64( absA );
1200     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1201
1202 }
1203
1204 #endif
1205
1206 #ifdef FLOAT128
1207
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 64-bit two's complement integer `a' to
1210 | the quadruple-precision floating-point format.  The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213
1214 float128 int64_to_float128( int64 a )
1215 {
1216     flag zSign;
1217     uint64 absA;
1218     int8 shiftCount;
1219     int32 zExp;
1220     bits64 zSig0, zSig1;
1221
1222     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1223     zSign = ( a < 0 );
1224     absA = zSign ? - a : a;
1225     shiftCount = countLeadingZeros64( absA ) + 49;
1226     zExp = 0x406E - shiftCount;
1227     if ( 64 <= shiftCount ) {
1228         zSig1 = 0;
1229         zSig0 = absA;
1230         shiftCount -= 64;
1231     }
1232     else {
1233         zSig1 = absA;
1234         zSig0 = 0;
1235     }
1236     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1237     return packFloat128( zSign, zExp, zSig0, zSig1 );
1238
1239 }
1240
1241 #endif
1242
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the single-precision floating-point value
1245 | `a' to the 32-bit two's complement integer format.  The conversion is
1246 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1247 | Arithmetic---which means in particular that the conversion is rounded
1248 | according to the current rounding mode.  If `a' is a NaN, the largest
1249 | positive integer is returned.  Otherwise, if the conversion overflows, the
1250 | largest integer with the same sign as `a' is returned.
1251 *----------------------------------------------------------------------------*/
1252
1253 int32 float32_to_int32( float32 a )
1254 {
1255     flag aSign;
1256     int16 aExp, shiftCount;
1257     bits32 aSig;
1258     bits64 aSig64;
1259
1260     aSig = extractFloat32Frac( a );
1261     aExp = extractFloat32Exp( a );
1262     aSign = extractFloat32Sign( a );
1263     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1264     if ( aExp ) aSig |= 0x00800000;
1265     shiftCount = 0xAF - aExp;
1266     aSig64 = aSig;
1267     aSig64 <<= 32;
1268     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1269     return roundAndPackInt32( aSign, aSig64 );
1270
1271 }
1272
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the single-precision floating-point value
1275 | `a' to the 32-bit two's complement integer format.  The conversion is
1276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 | Arithmetic, except that the conversion is always rounded toward zero.
1278 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
1279 | the conversion overflows, the largest integer with the same sign as `a' is
1280 | returned.
1281 *----------------------------------------------------------------------------*/
1282
1283 int32 float32_to_int32_round_to_zero( float32 a )
1284 {
1285     flag aSign;
1286     int16 aExp, shiftCount;
1287     bits32 aSig;
1288     int32 z;
1289
1290     aSig = extractFloat32Frac( a );
1291     aExp = extractFloat32Exp( a );
1292     aSign = extractFloat32Sign( a );
1293     shiftCount = aExp - 0x9E;
1294     if ( 0 <= shiftCount ) {
1295         if ( a != 0xCF000000 ) {
1296             float_raise( float_flag_invalid );
1297             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1298         }
1299         return (sbits32) 0x80000000;
1300     }
1301     else if ( aExp <= 0x7E ) {
1302         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1303         return 0;
1304     }
1305     aSig = ( aSig | 0x00800000 )<<8;
1306     z = aSig>>( - shiftCount );
1307     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1308         float_exception_flags |= float_flag_inexact;
1309     }
1310     if ( aSign ) z = - z;
1311     return z;
1312
1313 }
1314
1315 /*----------------------------------------------------------------------------
1316 | Returns the result of converting the single-precision floating-point value
1317 | `a' to the 64-bit two's complement integer format.  The conversion is
1318 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1319 | Arithmetic---which means in particular that the conversion is rounded
1320 | according to the current rounding mode.  If `a' is a NaN, the largest
1321 | positive integer is returned.  Otherwise, if the conversion overflows, the
1322 | largest integer with the same sign as `a' is returned.
1323 *----------------------------------------------------------------------------*/
1324
1325 int64 float32_to_int64( float32 a )
1326 {
1327     flag aSign;
1328     int16 aExp, shiftCount;
1329     bits32 aSig;
1330     bits64 aSig64, aSigExtra;
1331
1332     aSig = extractFloat32Frac( a );
1333     aExp = extractFloat32Exp( a );
1334     aSign = extractFloat32Sign( a );
1335     shiftCount = 0xBE - aExp;
1336     if ( shiftCount < 0 ) {
1337         float_raise( float_flag_invalid );
1338         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1339             return LIT64( 0x7FFFFFFFFFFFFFFF );
1340         }
1341         return (sbits64) LIT64( 0x8000000000000000 );
1342     }
1343     if ( aExp ) aSig |= 0x00800000;
1344     aSig64 = aSig;
1345     aSig64 <<= 40;
1346     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1347     return roundAndPackInt64( aSign, aSig64, aSigExtra );
1348
1349 }
1350
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the single-precision floating-point value
1353 | `a' to the 64-bit two's complement integer format.  The conversion is
1354 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1355 | Arithmetic, except that the conversion is always rounded toward zero.  If
1356 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
1357 | conversion overflows, the largest integer with the same sign as `a' is
1358 | returned.
1359 *----------------------------------------------------------------------------*/
1360
1361 int64 float32_to_int64_round_to_zero( float32 a )
1362 {
1363     flag aSign;
1364     int16 aExp, shiftCount;
1365     bits32 aSig;
1366     bits64 aSig64;
1367     int64 z;
1368
1369     aSig = extractFloat32Frac( a );
1370     aExp = extractFloat32Exp( a );
1371     aSign = extractFloat32Sign( a );
1372     shiftCount = aExp - 0xBE;
1373     if ( 0 <= shiftCount ) {
1374         if ( a != 0xDF000000 ) {
1375             float_raise( float_flag_invalid );
1376             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1377                 return LIT64( 0x7FFFFFFFFFFFFFFF );
1378             }
1379         }
1380         return (sbits64) LIT64( 0x8000000000000000 );
1381     }
1382     else if ( aExp <= 0x7E ) {
1383         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1384         return 0;
1385     }
1386     aSig64 = aSig | 0x00800000;
1387     aSig64 <<= 40;
1388     z = aSig64>>( - shiftCount );
1389     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1390         float_exception_flags |= float_flag_inexact;
1391     }
1392     if ( aSign ) z = - z;
1393     return z;
1394
1395 }
1396
1397 /*----------------------------------------------------------------------------
1398 | Returns the result of converting the single-precision floating-point value
1399 | `a' to the double-precision floating-point format.  The conversion is
1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 | Arithmetic.
1402 *----------------------------------------------------------------------------*/
1403
1404 float64 float32_to_float64( float32 a )
1405 {
1406     flag aSign;
1407     int16 aExp;
1408     bits32 aSig;
1409
1410     aSig = extractFloat32Frac( a );
1411     aExp = extractFloat32Exp( a );
1412     aSign = extractFloat32Sign( a );
1413     if ( aExp == 0xFF ) {
1414         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1415         return packFloat64( aSign, 0x7FF, 0 );
1416     }
1417     if ( aExp == 0 ) {
1418         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1419         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1420         --aExp;
1421     }
1422     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1423
1424 }
1425
1426 #ifdef FLOATX80
1427
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the single-precision floating-point value
1430 | `a' to the extended double-precision floating-point format.  The conversion
1431 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1432 | Arithmetic.
1433 *----------------------------------------------------------------------------*/
1434
1435 floatx80 float32_to_floatx80( float32 a )
1436 {
1437     flag aSign;
1438     int16 aExp;
1439     bits32 aSig;
1440
1441     aSig = extractFloat32Frac( a );
1442     aExp = extractFloat32Exp( a );
1443     aSign = extractFloat32Sign( a );
1444     if ( aExp == 0xFF ) {
1445         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1446         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1447     }
1448     if ( aExp == 0 ) {
1449         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1450         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1451     }
1452     aSig |= 0x00800000;
1453     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1454
1455 }
1456
1457 #endif
1458
1459 #ifdef FLOAT128
1460
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the double-precision floating-point format.  The conversion is
1464 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1465 | Arithmetic.
1466 *----------------------------------------------------------------------------*/
1467
1468 float128 float32_to_float128( float32 a )
1469 {
1470     flag aSign;
1471     int16 aExp;
1472     bits32 aSig;
1473
1474     aSig = extractFloat32Frac( a );
1475     aExp = extractFloat32Exp( a );
1476     aSign = extractFloat32Sign( a );
1477     if ( aExp == 0xFF ) {
1478         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1479         return packFloat128( aSign, 0x7FFF, 0, 0 );
1480     }
1481     if ( aExp == 0 ) {
1482         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1483         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1484         --aExp;
1485     }
1486     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1487
1488 }
1489
1490 #endif
1491
1492 /*----------------------------------------------------------------------------
1493 | Rounds the single-precision floating-point value `a' to an integer, and
1494 | returns the result as a single-precision floating-point value.  The
1495 | operation is performed according to the IEC/IEEE Standard for Binary
1496 | Floating-Point Arithmetic.
1497 *----------------------------------------------------------------------------*/
1498
1499 float32 float32_round_to_int( float32 a )
1500 {
1501     flag aSign;
1502     int16 aExp;
1503     bits32 lastBitMask, roundBitsMask;
1504     int8 roundingMode;
1505     float32 z;
1506
1507     aExp = extractFloat32Exp( a );
1508     if ( 0x96 <= aExp ) {
1509         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1510             return propagateFloat32NaN( a, a );
1511         }
1512         return a;
1513     }
1514     if ( aExp <= 0x7E ) {
1515         if ( (bits32) ( a<<1 ) == 0 ) return a;
1516         float_exception_flags |= float_flag_inexact;
1517         aSign = extractFloat32Sign( a );
1518         switch ( float_rounding_mode ) {
1519          case float_round_nearest_even:
1520             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1521                 return packFloat32( aSign, 0x7F, 0 );
1522             }
1523             break;
1524          case float_round_down:
1525             return aSign ? 0xBF800000 : 0;
1526          case float_round_up:
1527             return aSign ? 0x80000000 : 0x3F800000;
1528         }
1529         return packFloat32( aSign, 0, 0 );
1530     }
1531     lastBitMask = 1;
1532     lastBitMask <<= 0x96 - aExp;
1533     roundBitsMask = lastBitMask - 1;
1534     z = a;
1535     roundingMode = float_rounding_mode;
1536     if ( roundingMode == float_round_nearest_even ) {
1537         z += lastBitMask>>1;
1538         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1539     }
1540     else if ( roundingMode != float_round_to_zero ) {
1541         if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1542             z += roundBitsMask;
1543         }
1544     }
1545     z &= ~ roundBitsMask;
1546     if ( z != a ) float_exception_flags |= float_flag_inexact;
1547     return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of adding the absolute values of the single-precision
1553 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
1554 | before being returned.  `zSign' is ignored if the result is a NaN.
1555 | The addition is performed according to the IEC/IEEE Standard for Binary
1556 | Floating-Point Arithmetic.
1557 *----------------------------------------------------------------------------*/
1558
1559 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1560 {
1561     int16 aExp, bExp, zExp;
1562     bits32 aSig, bSig, zSig;
1563     int16 expDiff;
1564
1565     aSig = extractFloat32Frac( a );
1566     aExp = extractFloat32Exp( a );
1567     bSig = extractFloat32Frac( b );
1568     bExp = extractFloat32Exp( b );
1569     expDiff = aExp - bExp;
1570     aSig <<= 6;
1571     bSig <<= 6;
1572     if ( 0 < expDiff ) {
1573         if ( aExp == 0xFF ) {
1574             if ( aSig ) return propagateFloat32NaN( a, b );
1575             return a;
1576         }
1577         if ( bExp == 0 ) {
1578             --expDiff;
1579         }
1580         else {
1581             bSig |= 0x20000000;
1582         }
1583         shift32RightJamming( bSig, expDiff, &bSig );
1584         zExp = aExp;
1585     }
1586     else if ( expDiff < 0 ) {
1587         if ( bExp == 0xFF ) {
1588             if ( bSig ) return propagateFloat32NaN( a, b );
1589             return packFloat32( zSign, 0xFF, 0 );
1590         }
1591         if ( aExp == 0 ) {
1592             ++expDiff;
1593         }
1594         else {
1595             aSig |= 0x20000000;
1596         }
1597         shift32RightJamming( aSig, - expDiff, &aSig );
1598         zExp = bExp;
1599     }
1600     else {
1601         if ( aExp == 0xFF ) {
1602             if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1603             return a;
1604         }
1605         if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1606         zSig = 0x40000000 + aSig + bSig;
1607         zExp = aExp;
1608         goto roundAndPack;
1609     }
1610     aSig |= 0x20000000;
1611     zSig = ( aSig + bSig )<<1;
1612     --zExp;
1613     if ( (sbits32) zSig < 0 ) {
1614         zSig = aSig + bSig;
1615         ++zExp;
1616     }
1617  roundAndPack:
1618     return roundAndPackFloat32( zSign, zExp, zSig );
1619
1620 }
1621
1622 /*----------------------------------------------------------------------------
1623 | Returns the result of subtracting the absolute values of the single-
1624 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
1625 | difference is negated before being returned.  `zSign' is ignored if the
1626 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
1627 | Standard for Binary Floating-Point Arithmetic.
1628 *----------------------------------------------------------------------------*/
1629
1630 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1631 {
1632     int16 aExp, bExp, zExp;
1633     bits32 aSig, bSig, zSig;
1634     int16 expDiff;
1635
1636     aSig = extractFloat32Frac( a );
1637     aExp = extractFloat32Exp( a );
1638     bSig = extractFloat32Frac( b );
1639     bExp = extractFloat32Exp( b );
1640     expDiff = aExp - bExp;
1641     aSig <<= 7;
1642     bSig <<= 7;
1643     if ( 0 < expDiff ) goto aExpBigger;
1644     if ( expDiff < 0 ) goto bExpBigger;
1645     if ( aExp == 0xFF ) {
1646         if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1647         float_raise( float_flag_invalid );
1648         return float32_default_nan;
1649     }
1650     if ( aExp == 0 ) {
1651         aExp = 1;
1652         bExp = 1;
1653     }
1654     if ( bSig < aSig ) goto aBigger;
1655     if ( aSig < bSig ) goto bBigger;
1656     return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1657  bExpBigger:
1658     if ( bExp == 0xFF ) {
1659         if ( bSig ) return propagateFloat32NaN( a, b );
1660         return packFloat32( zSign ^ 1, 0xFF, 0 );
1661     }
1662     if ( aExp == 0 ) {
1663         ++expDiff;
1664     }
1665     else {
1666         aSig |= 0x40000000;
1667     }
1668     shift32RightJamming( aSig, - expDiff, &aSig );
1669     bSig |= 0x40000000;
1670  bBigger:
1671     zSig = bSig - aSig;
1672     zExp = bExp;
1673     zSign ^= 1;
1674     goto normalizeRoundAndPack;
1675  aExpBigger:
1676     if ( aExp == 0xFF ) {
1677         if ( aSig ) return propagateFloat32NaN( a, b );
1678         return a;
1679     }
1680     if ( bExp == 0 ) {
1681         --expDiff;
1682     }
1683     else {
1684         bSig |= 0x40000000;
1685     }
1686     shift32RightJamming( bSig, expDiff, &bSig );
1687     aSig |= 0x40000000;
1688  aBigger:
1689     zSig = aSig - bSig;
1690     zExp = aExp;
1691  normalizeRoundAndPack:
1692     --zExp;
1693     return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1694
1695 }
1696
1697 /*----------------------------------------------------------------------------
1698 | Returns the result of adding the single-precision floating-point values `a'
1699 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
1700 | Binary Floating-Point Arithmetic.
1701 *----------------------------------------------------------------------------*/
1702
1703 float32 float32_add( float32 a, float32 b )
1704 {
1705     flag aSign, bSign;
1706
1707     aSign = extractFloat32Sign( a );
1708     bSign = extractFloat32Sign( b );
1709     if ( aSign == bSign ) {
1710         return addFloat32Sigs( a, b, aSign );
1711     }
1712     else {
1713         return subFloat32Sigs( a, b, aSign );
1714     }
1715
1716 }
1717
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of subtracting the single-precision floating-point values
1720 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1721 | for Binary Floating-Point Arithmetic.
1722 *----------------------------------------------------------------------------*/
1723
1724 float32 float32_sub( float32 a, float32 b )
1725 {
1726     flag aSign, bSign;
1727
1728     aSign = extractFloat32Sign( a );
1729     bSign = extractFloat32Sign( b );
1730     if ( aSign == bSign ) {
1731         return subFloat32Sigs( a, b, aSign );
1732     }
1733     else {
1734         return addFloat32Sigs( a, b, aSign );
1735     }
1736
1737 }
1738
1739 /*----------------------------------------------------------------------------
1740 | Returns the result of multiplying the single-precision floating-point values
1741 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
1742 | for Binary Floating-Point Arithmetic.
1743 *----------------------------------------------------------------------------*/
1744
1745 float32 float32_mul( float32 a, float32 b )
1746 {
1747     flag aSign, bSign, zSign;
1748     int16 aExp, bExp, zExp;
1749     bits32 aSig, bSig;
1750     bits64 zSig64;
1751     bits32 zSig;
1752
1753     aSig = extractFloat32Frac( a );
1754     aExp = extractFloat32Exp( a );
1755     aSign = extractFloat32Sign( a );
1756     bSig = extractFloat32Frac( b );
1757     bExp = extractFloat32Exp( b );
1758     bSign = extractFloat32Sign( b );
1759     zSign = aSign ^ bSign;
1760     if ( aExp == 0xFF ) {
1761         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1762             return propagateFloat32NaN( a, b );
1763         }
1764         if ( ( bExp | bSig ) == 0 ) {
1765             float_raise( float_flag_invalid );
1766             return float32_default_nan;
1767         }
1768         return packFloat32( zSign, 0xFF, 0 );
1769     }
1770     if ( bExp == 0xFF ) {
1771         if ( bSig ) return propagateFloat32NaN( a, b );
1772         if ( ( aExp | aSig ) == 0 ) {
1773             float_raise( float_flag_invalid );
1774             return float32_default_nan;
1775         }
1776         return packFloat32( zSign, 0xFF, 0 );
1777     }
1778     if ( aExp == 0 ) {
1779         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1780         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1781     }
1782     if ( bExp == 0 ) {
1783         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1784         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1785     }
1786     zExp = aExp + bExp - 0x7F;
1787     aSig = ( aSig | 0x00800000 )<<7;
1788     bSig = ( bSig | 0x00800000 )<<8;
1789     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1790     zSig = zSig64;
1791     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1792         zSig <<= 1;
1793         --zExp;
1794     }
1795     return roundAndPackFloat32( zSign, zExp, zSig );
1796
1797 }
1798
1799 /*----------------------------------------------------------------------------
1800 | Returns the result of dividing the single-precision floating-point value `a'
1801 | by the corresponding value `b'.  The operation is performed according to the
1802 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1803 *----------------------------------------------------------------------------*/
1804
1805 float32 float32_div( float32 a, float32 b )
1806 {
1807     flag aSign, bSign, zSign;
1808     int16 aExp, bExp, zExp;
1809     bits32 aSig, bSig, zSig;
1810
1811     aSig = extractFloat32Frac( a );
1812     aExp = extractFloat32Exp( a );
1813     aSign = extractFloat32Sign( a );
1814     bSig = extractFloat32Frac( b );
1815     bExp = extractFloat32Exp( b );
1816     bSign = extractFloat32Sign( b );
1817     zSign = aSign ^ bSign;
1818     if ( aExp == 0xFF ) {
1819         if ( aSig ) return propagateFloat32NaN( a, b );
1820         if ( bExp == 0xFF ) {
1821             if ( bSig ) return propagateFloat32NaN( a, b );
1822             float_raise( float_flag_invalid );
1823             return float32_default_nan;
1824         }
1825         return packFloat32( zSign, 0xFF, 0 );
1826     }
1827     if ( bExp == 0xFF ) {
1828         if ( bSig ) return propagateFloat32NaN( a, b );
1829         return packFloat32( zSign, 0, 0 );
1830     }
1831     if ( bExp == 0 ) {
1832         if ( bSig == 0 ) {
1833             if ( ( aExp | aSig ) == 0 ) {
1834                 float_raise( float_flag_invalid );
1835                 return float32_default_nan;
1836             }
1837             float_raise( float_flag_divbyzero );
1838             return packFloat32( zSign, 0xFF, 0 );
1839         }
1840         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1841     }
1842     if ( aExp == 0 ) {
1843         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1844         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1845     }
1846     zExp = aExp - bExp + 0x7D;
1847     aSig = ( aSig | 0x00800000 )<<7;
1848     bSig = ( bSig | 0x00800000 )<<8;
1849     if ( bSig <= ( aSig + aSig ) ) {
1850         aSig >>= 1;
1851         ++zExp;
1852     }
1853     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1854     if ( ( zSig & 0x3F ) == 0 ) {
1855         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1856     }
1857     return roundAndPackFloat32( zSign, zExp, zSig );
1858
1859 }
1860
1861 /*----------------------------------------------------------------------------
1862 | Returns the remainder of the single-precision floating-point value `a'
1863 | with respect to the corresponding value `b'.  The operation is performed
1864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1866
1867 float32 float32_rem( float32 a, float32 b )
1868 {
1869     flag aSign, bSign, zSign;
1870     int16 aExp, bExp, expDiff;
1871     bits32 aSig, bSig;
1872     bits32 q;
1873     bits64 aSig64, bSig64, q64;
1874     bits32 alternateASig;
1875     sbits32 sigMean;
1876
1877     aSig = extractFloat32Frac( a );
1878     aExp = extractFloat32Exp( a );
1879     aSign = extractFloat32Sign( a );
1880     bSig = extractFloat32Frac( b );
1881     bExp = extractFloat32Exp( b );
1882     bSign = extractFloat32Sign( b );
1883     if ( aExp == 0xFF ) {
1884         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1885             return propagateFloat32NaN( a, b );
1886         }
1887         float_raise( float_flag_invalid );
1888         return float32_default_nan;
1889     }
1890     if ( bExp == 0xFF ) {
1891         if ( bSig ) return propagateFloat32NaN( a, b );
1892         return a;
1893     }
1894     if ( bExp == 0 ) {
1895         if ( bSig == 0 ) {
1896             float_raise( float_flag_invalid );
1897             return float32_default_nan;
1898         }
1899         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1900     }
1901     if ( aExp == 0 ) {
1902         if ( aSig == 0 ) return a;
1903         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1904     }
1905     expDiff = aExp - bExp;
1906     aSig |= 0x00800000;
1907     bSig |= 0x00800000;
1908     if ( expDiff < 32 ) {
1909         aSig <<= 8;
1910         bSig <<= 8;
1911         if ( expDiff < 0 ) {
1912             if ( expDiff < -1 ) return a;
1913             aSig >>= 1;
1914         }
1915         q = ( bSig <= aSig );
1916         if ( q ) aSig -= bSig;
1917         if ( 0 < expDiff ) {
1918             q = ( ( (bits64) aSig )<<32 ) / bSig;
1919             q >>= 32 - expDiff;
1920             bSig >>= 2;
1921             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1922         }
1923         else {
1924             aSig >>= 2;
1925             bSig >>= 2;
1926         }
1927     }
1928     else {
1929         if ( bSig <= aSig ) aSig -= bSig;
1930         aSig64 = ( (bits64) aSig )<<40;
1931         bSig64 = ( (bits64) bSig )<<40;
1932         expDiff -= 64;
1933         while ( 0 < expDiff ) {
1934             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1935             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1936             aSig64 = - ( ( bSig * q64 )<<38 );
1937             expDiff -= 62;
1938         }
1939         expDiff += 64;
1940         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1941         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1942         q = q64>>( 64 - expDiff );
1943         bSig <<= 6;
1944         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1945     }
1946     do {
1947         alternateASig = aSig;
1948         ++q;
1949         aSig -= bSig;
1950     } while ( 0 <= (sbits32) aSig );
1951     sigMean = aSig + alternateASig;
1952     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1953         aSig = alternateASig;
1954     }
1955     zSign = ( (sbits32) aSig < 0 );
1956     if ( zSign ) aSig = - aSig;
1957     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1958
1959 }
1960
1961 /*----------------------------------------------------------------------------
1962 | Returns the square root of the single-precision floating-point value `a'.
1963 | The operation is performed according to the IEC/IEEE Standard for Binary
1964 | Floating-Point Arithmetic.
1965 *----------------------------------------------------------------------------*/
1966
1967 float32 float32_sqrt( float32 a )
1968 {
1969     flag aSign;
1970     int16 aExp, zExp;
1971     bits32 aSig, zSig;
1972     bits64 rem, term;
1973
1974     aSig = extractFloat32Frac( a );
1975     aExp = extractFloat32Exp( a );
1976     aSign = extractFloat32Sign( a );
1977     if ( aExp == 0xFF ) {
1978         if ( aSig ) return propagateFloat32NaN( a, 0 );
1979         if ( ! aSign ) return a;
1980         float_raise( float_flag_invalid );
1981         return float32_default_nan;
1982     }
1983     if ( aSign ) {
1984         if ( ( aExp | aSig ) == 0 ) return a;
1985         float_raise( float_flag_invalid );
1986         return float32_default_nan;
1987     }
1988     if ( aExp == 0 ) {
1989         if ( aSig == 0 ) return 0;
1990         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1991     }
1992     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1993     aSig = ( aSig | 0x00800000 )<<8;
1994     zSig = estimateSqrt32( aExp, aSig ) + 2;
1995     if ( ( zSig & 0x7F ) <= 5 ) {
1996         if ( zSig < 2 ) {
1997             zSig = 0x7FFFFFFF;
1998             goto roundAndPack;
1999         }
2000         aSig >>= aExp & 1;
2001         term = ( (bits64) zSig ) * zSig;
2002         rem = ( ( (bits64) aSig )<<32 ) - term;
2003         while ( (sbits64) rem < 0 ) {
2004             --zSig;
2005             rem += ( ( (bits64) zSig )<<1 ) | 1;
2006         }
2007         zSig |= ( rem != 0 );
2008     }
2009     shift32RightJamming( zSig, 1, &zSig );
2010  roundAndPack:
2011     return roundAndPackFloat32( 0, zExp, zSig );
2012
2013 }
2014
2015 /*----------------------------------------------------------------------------
2016 | Returns 1 if the single-precision floating-point value `a' is equal to
2017 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019 *----------------------------------------------------------------------------*/
2020
2021 flag float32_eq( float32 a, float32 b )
2022 {
2023
2024     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2025          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2026        ) {
2027         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2028             float_raise( float_flag_invalid );
2029         }
2030         return 0;
2031     }
2032     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2033
2034 }
2035
2036 /*----------------------------------------------------------------------------
2037 | Returns 1 if the single-precision floating-point value `a' is less than
2038 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
2039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2040 | Arithmetic.
2041 *----------------------------------------------------------------------------*/
2042
2043 flag float32_le( float32 a, float32 b )
2044 {
2045     flag aSign, bSign;
2046
2047     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2048          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2049        ) {
2050         float_raise( float_flag_invalid );
2051         return 0;
2052     }
2053     aSign = extractFloat32Sign( a );
2054     bSign = extractFloat32Sign( b );
2055     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2056     return ( a == b ) || ( aSign ^ ( a < b ) );
2057
2058 }
2059
2060 /*----------------------------------------------------------------------------
2061 | Returns 1 if the single-precision floating-point value `a' is less than
2062 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2064 *----------------------------------------------------------------------------*/
2065
2066 flag float32_lt( float32 a, float32 b )
2067 {
2068     flag aSign, bSign;
2069
2070     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2071          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2072        ) {
2073         float_raise( float_flag_invalid );
2074         return 0;
2075     }
2076     aSign = extractFloat32Sign( a );
2077     bSign = extractFloat32Sign( b );
2078     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2079     return ( a != b ) && ( aSign ^ ( a < b ) );
2080
2081 }
2082
2083 /*----------------------------------------------------------------------------
2084 | Returns 1 if the single-precision floating-point value `a' is equal to
2085 | the corresponding value `b', and 0 otherwise.  The invalid exception is
2086 | raised if either operand is a NaN.  Otherwise, the comparison is performed
2087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088 *----------------------------------------------------------------------------*/
2089
2090 flag float32_eq_signaling( float32 a, float32 b )
2091 {
2092
2093     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2094          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2095        ) {
2096         float_raise( float_flag_invalid );
2097         return 0;
2098     }
2099     return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2100
2101 }
2102
2103 /*----------------------------------------------------------------------------
2104 | Returns 1 if the single-precision floating-point value `a' is less than or
2105 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
2106 | cause an exception.  Otherwise, the comparison is performed according to the
2107 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2108 *----------------------------------------------------------------------------*/
2109
2110 flag float32_le_quiet( float32 a, float32 b )
2111 {
2112     flag aSign, bSign;
2113     int16 aExp, bExp;
2114
2115     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117        ) {
2118         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2119             float_raise( float_flag_invalid );
2120         }
2121         return 0;
2122     }
2123     aSign = extractFloat32Sign( a );
2124     bSign = extractFloat32Sign( b );
2125     if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2126     return ( a == b ) || ( aSign ^ ( a < b ) );
2127
2128 }
2129
2130 /*----------------------------------------------------------------------------
2131 | Returns 1 if the single-precision floating-point value `a' is less than
2132 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
2133 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
2134 | Standard for Binary Floating-Point Arithmetic.
2135 *----------------------------------------------------------------------------*/
2136
2137 flag float32_lt_quiet( float32 a, float32 b )
2138 {
2139     flag aSign, bSign;
2140
2141     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2142          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2143        ) {
2144         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2145             float_raise( float_flag_invalid );
2146         }
2147         return 0;
2148     }
2149     aSign = extractFloat32Sign( a );
2150     bSign = extractFloat32Sign( b );
2151     if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2152     return ( a != b ) && ( aSign ^ ( a < b ) );
2153
2154 }
2155
2156 /*----------------------------------------------------------------------------
2157 | Returns the result of converting the double-precision floating-point value
2158 | `a' to the 32-bit two's complement integer format.  The conversion is
2159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2160 | Arithmetic---which means in particular that the conversion is rounded
2161 | according to the current rounding mode.  If `a' is a NaN, the largest
2162 | positive integer is returned.  Otherwise, if the conversion overflows, the
2163 | largest integer with the same sign as `a' is returned.
2164 *----------------------------------------------------------------------------*/
2165
2166 int32 float64_to_int32( float64 a )
2167 {
2168     flag aSign;
2169     int16 aExp, shiftCount;
2170     bits64 aSig;
2171
2172     aSig = extractFloat64Frac( a );
2173     aExp = extractFloat64Exp( a );
2174     aSign = extractFloat64Sign( a );
2175     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2176     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2177     shiftCount = 0x42C - aExp;
2178     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2179     return roundAndPackInt32( aSign, aSig );
2180
2181 }
2182
2183 /*----------------------------------------------------------------------------
2184 | Returns the result of converting the double-precision floating-point value
2185 | `a' to the 32-bit two's complement integer format.  The conversion is
2186 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2187 | Arithmetic, except that the conversion is always rounded toward zero.
2188 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2189 | the conversion overflows, the largest integer with the same sign as `a' is
2190 | returned.
2191 *----------------------------------------------------------------------------*/
2192
2193 int32 float64_to_int32_round_to_zero( float64 a )
2194 {
2195     flag aSign;
2196     int16 aExp, shiftCount;
2197     bits64 aSig, savedASig;
2198     int32 z;
2199
2200     aSig = extractFloat64Frac( a );
2201     aExp = extractFloat64Exp( a );
2202     aSign = extractFloat64Sign( a );
2203     if ( 0x41E < aExp ) {
2204         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2205         goto invalid;
2206     }
2207     else if ( aExp < 0x3FF ) {
2208         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2209         return 0;
2210     }
2211     aSig |= LIT64( 0x0010000000000000 );
2212     shiftCount = 0x433 - aExp;
2213     savedASig = aSig;
2214     aSig >>= shiftCount;
2215     z = aSig;
2216     if ( aSign ) z = - z;
2217     if ( ( z < 0 ) ^ aSign ) {
2218  invalid:
2219         float_raise( float_flag_invalid );
2220         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2221     }
2222     if ( ( aSig<<shiftCount ) != savedASig ) {
2223         float_exception_flags |= float_flag_inexact;
2224     }
2225     return z;
2226
2227 }
2228
2229 /*----------------------------------------------------------------------------
2230 | Returns the result of converting the double-precision floating-point value
2231 | `a' to the 64-bit two's complement integer format.  The conversion is
2232 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2233 | Arithmetic---which means in particular that the conversion is rounded
2234 | according to the current rounding mode.  If `a' is a NaN, the largest
2235 | positive integer is returned.  Otherwise, if the conversion overflows, the
2236 | largest integer with the same sign as `a' is returned.
2237 *----------------------------------------------------------------------------*/
2238
2239 int64 float64_to_int64( float64 a )
2240 {
2241     flag aSign;
2242     int16 aExp, shiftCount;
2243     bits64 aSig, aSigExtra;
2244
2245     aSig = extractFloat64Frac( a );
2246     aExp = extractFloat64Exp( a );
2247     aSign = extractFloat64Sign( a );
2248     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2249     shiftCount = 0x433 - aExp;
2250     if ( shiftCount <= 0 ) {
2251         if ( 0x43E < aExp ) {
2252             float_raise( float_flag_invalid );
2253             if (    ! aSign
2254                  || (    ( aExp == 0x7FF )
2255                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
2256                ) {
2257                 return LIT64( 0x7FFFFFFFFFFFFFFF );
2258             }
2259             return (sbits64) LIT64( 0x8000000000000000 );
2260         }
2261         aSigExtra = 0;
2262         aSig <<= - shiftCount;
2263     }
2264     else {
2265         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2266     }
2267     return roundAndPackInt64( aSign, aSig, aSigExtra );
2268
2269 }
2270
2271 /*----------------------------------------------------------------------------
2272 | Returns the result of converting the double-precision floating-point value
2273 | `a' to the 64-bit two's complement integer format.  The conversion is
2274 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2275 | Arithmetic, except that the conversion is always rounded toward zero.
2276 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
2277 | the conversion overflows, the largest integer with the same sign as `a' is
2278 | returned.
2279 *----------------------------------------------------------------------------*/
2280
2281 int64 float64_to_int64_round_to_zero( float64 a )
2282 {
2283     flag aSign;
2284     int16 aExp, shiftCount;
2285     bits64 aSig;
2286     int64 z;
2287
2288     aSig = extractFloat64Frac( a );
2289     aExp = extractFloat64Exp( a );
2290     aSign = extractFloat64Sign( a );
2291     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2292     shiftCount = aExp - 0x433;
2293     if ( 0 <= shiftCount ) {
2294         if ( 0x43E <= aExp ) {
2295             if ( a != LIT64( 0xC3E0000000000000 ) ) {
2296                 float_raise( float_flag_invalid );
2297                 if (    ! aSign
2298                      || (    ( aExp == 0x7FF )
2299                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
2300                    ) {
2301                     return LIT64( 0x7FFFFFFFFFFFFFFF );
2302                 }
2303             }
2304             return (sbits64) LIT64( 0x8000000000000000 );
2305         }
2306         z = aSig<<shiftCount;
2307     }
2308     else {
2309         if ( aExp < 0x3FE ) {
2310             if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2311             return 0;
2312         }
2313         z = aSig>>( - shiftCount );
2314         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2315             float_exception_flags |= float_flag_inexact;
2316         }
2317     }
2318     if ( aSign ) z = - z;
2319     return z;
2320
2321 }
2322
2323 /*----------------------------------------------------------------------------
2324 | Returns the result of converting the double-precision floating-point value
2325 | `a' to the single-precision floating-point format.  The conversion is
2326 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 | Arithmetic.
2328 *----------------------------------------------------------------------------*/
2329
2330 float32 float64_to_float32( float64 a )
2331 {
2332     flag aSign;
2333     int16 aExp;
2334     bits64 aSig;
2335     bits32 zSig;
2336
2337     aSig = extractFloat64Frac( a );
2338     aExp = extractFloat64Exp( a );
2339     aSign = extractFloat64Sign( a );
2340     if ( aExp == 0x7FF ) {
2341         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2342         return packFloat32( aSign, 0xFF, 0 );
2343     }
2344     shift64RightJamming( aSig, 22, &aSig );
2345     zSig = aSig;
2346     if ( aExp || zSig ) {
2347         zSig |= 0x40000000;
2348         aExp -= 0x381;
2349     }
2350     return roundAndPackFloat32( aSign, aExp, zSig );
2351
2352 }
2353
2354 #ifdef FLOATX80
2355
2356 /*----------------------------------------------------------------------------
2357 | Returns the result of converting the double-precision floating-point value
2358 | `a' to the extended double-precision floating-point format.  The conversion
2359 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 | Arithmetic.
2361 *----------------------------------------------------------------------------*/
2362
2363 floatx80 float64_to_floatx80( float64 a )
2364 {
2365     flag aSign;
2366     int16 aExp;
2367     bits64 aSig;
2368
2369     aSig = extractFloat64Frac( a );
2370     aExp = extractFloat64Exp( a );
2371     aSign = extractFloat64Sign( a );
2372     if ( aExp == 0x7FF ) {
2373         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2374         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2375     }
2376     if ( aExp == 0 ) {
2377         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2378         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2379     }
2380     return
2381         packFloatx80(
2382             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2383
2384 }
2385
2386 #endif
2387
2388 #ifdef FLOAT128
2389
2390 /*----------------------------------------------------------------------------
2391 | Returns the result of converting the double-precision floating-point value
2392 | `a' to the quadruple-precision floating-point format.  The conversion is
2393 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2394 | Arithmetic.
2395 *----------------------------------------------------------------------------*/
2396
2397 float128 float64_to_float128( float64 a )
2398 {
2399     flag aSign;
2400     int16 aExp;
2401     bits64 aSig, zSig0, zSig1;
2402
2403     aSig = extractFloat64Frac( a );
2404     aExp = extractFloat64Exp( a );
2405     aSign = extractFloat64Sign( a );
2406     if ( aExp == 0x7FF ) {
2407         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2408         return packFloat128( aSign, 0x7FFF, 0, 0 );
2409     }
2410     if ( aExp == 0 ) {
2411         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2412         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2413         --aExp;
2414     }
2415     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2416     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2417
2418 }
2419
2420 #endif
2421
2422 /*----------------------------------------------------------------------------
2423 | Rounds the double-precision floating-point value `a' to an integer, and
2424 | returns the result as a double-precision floating-point value.  The
2425 | operation is performed according to the IEC/IEEE Standard for Binary
2426 | Floating-Point Arithmetic.
2427 *----------------------------------------------------------------------------*/
2428
2429 float64 float64_round_to_int( float64 a )
2430 {
2431     flag aSign;
2432     int16 aExp;
2433     bits64 lastBitMask, roundBitsMask;
2434     int8 roundingMode;
2435     float64 z;
2436
2437     aExp = extractFloat64Exp( a );
2438     if ( 0x433 <= aExp ) {
2439         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2440             return propagateFloat64NaN( a, a );
2441         }
2442         return a;
2443     }
2444     if ( aExp < 0x3FF ) {
2445         if ( (bits64) ( a<<1 ) == 0 ) return a;
2446         float_exception_flags |= float_flag_inexact;
2447         aSign = extractFloat64Sign( a );
2448         switch ( float_rounding_mode ) {
2449          case float_round_nearest_even:
2450             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2451                 return packFloat64( aSign, 0x3FF, 0 );
2452             }
2453             break;
2454          case float_round_down:
2455             return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2456          case float_round_up:
2457             return
2458             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2459         }
2460         return packFloat64( aSign, 0, 0 );
2461     }
2462     lastBitMask = 1;
2463     lastBitMask <<= 0x433 - aExp;
2464     roundBitsMask = lastBitMask - 1;
2465     z = a;
2466     roundingMode = float_rounding_mode;
2467     if ( roundingMode == float_round_nearest_even ) {
2468         z += lastBitMask>>1;
2469         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2470     }
2471     else if ( roundingMode != float_round_to_zero ) {
2472         if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2473             z += roundBitsMask;
2474         }
2475     }
2476     z &= ~ roundBitsMask;
2477     if ( z != a ) float_exception_flags |= float_flag_inexact;
2478     return z;
2479
2480 }
2481
2482 /*----------------------------------------------------------------------------
2483 | Returns the result of adding the absolute values of the double-precision
2484 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
2485 | before being returned.  `zSign' is ignored if the result is a NaN.
2486 | The addition is performed according to the IEC/IEEE Standard for Binary
2487 | Floating-Point Arithmetic.
2488 *----------------------------------------------------------------------------*/
2489
2490 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2491 {
2492     int16 aExp, bExp, zExp;
2493     bits64 aSig, bSig, zSig;
2494     int16 expDiff;
2495
2496     aSig = extractFloat64Frac( a );
2497     aExp = extractFloat64Exp( a );
2498     bSig = extractFloat64Frac( b );
2499     bExp = extractFloat64Exp( b );
2500     expDiff = aExp - bExp;
2501     aSig <<= 9;
2502     bSig <<= 9;
2503     if ( 0 < expDiff ) {
2504         if ( aExp == 0x7FF ) {
2505             if ( aSig ) return propagateFloat64NaN( a, b );
2506             return a;
2507         }
2508         if ( bExp == 0 ) {
2509             --expDiff;
2510         }
2511         else {
2512             bSig |= LIT64( 0x2000000000000000 );
2513         }
2514         shift64RightJamming( bSig, expDiff, &bSig );
2515         zExp = aExp;
2516     }
2517     else if ( expDiff < 0 ) {
2518         if ( bExp == 0x7FF ) {
2519             if ( bSig ) return propagateFloat64NaN( a, b );
2520             return packFloat64( zSign, 0x7FF, 0 );
2521         }
2522         if ( aExp == 0 ) {
2523             ++expDiff;
2524         }
2525         else {
2526             aSig |= LIT64( 0x2000000000000000 );
2527         }
2528         shift64RightJamming( aSig, - expDiff, &aSig );
2529         zExp = bExp;
2530     }
2531     else {
2532         if ( aExp == 0x7FF ) {
2533             if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2534             return a;
2535         }
2536         if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2537         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2538         zExp = aExp;
2539         goto roundAndPack;
2540     }
2541     aSig |= LIT64( 0x2000000000000000 );
2542     zSig = ( aSig + bSig )<<1;
2543     --zExp;
2544     if ( (sbits64) zSig < 0 ) {
2545         zSig = aSig + bSig;
2546         ++zExp;
2547     }
2548  roundAndPack:
2549     return roundAndPackFloat64( zSign, zExp, zSig );
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of subtracting the absolute values of the double-
2555 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
2556 | difference is negated before being returned.  `zSign' is ignored if the
2557 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
2558 | Standard for Binary Floating-Point Arithmetic.
2559 *----------------------------------------------------------------------------*/
2560
2561 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2562 {
2563     int16 aExp, bExp, zExp;
2564     bits64 aSig, bSig, zSig;
2565     int16 expDiff;
2566
2567     aSig = extractFloat64Frac( a );
2568     aExp = extractFloat64Exp( a );
2569     bSig = extractFloat64Frac( b );
2570     bExp = extractFloat64Exp( b );
2571     expDiff = aExp - bExp;
2572     aSig <<= 10;
2573     bSig <<= 10;
2574     if ( 0 < expDiff ) goto aExpBigger;
2575     if ( expDiff < 0 ) goto bExpBigger;
2576     if ( aExp == 0x7FF ) {
2577         if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2578         float_raise( float_flag_invalid );
2579         return float64_default_nan;
2580     }
2581     if ( aExp == 0 ) {
2582         aExp = 1;
2583         bExp = 1;
2584     }
2585     if ( bSig < aSig ) goto aBigger;
2586     if ( aSig < bSig ) goto bBigger;
2587     return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2588  bExpBigger:
2589     if ( bExp == 0x7FF ) {
2590         if ( bSig ) return propagateFloat64NaN( a, b );
2591         return packFloat64( zSign ^ 1, 0x7FF, 0 );
2592     }
2593     if ( aExp == 0 ) {
2594         ++expDiff;
2595     }
2596     else {
2597         aSig |= LIT64( 0x4000000000000000 );
2598     }
2599     shift64RightJamming( aSig, - expDiff, &aSig );
2600     bSig |= LIT64( 0x4000000000000000 );
2601  bBigger:
2602     zSig = bSig - aSig;
2603     zExp = bExp;
2604     zSign ^= 1;
2605     goto normalizeRoundAndPack;
2606  aExpBigger:
2607     if ( aExp == 0x7FF ) {
2608         if ( aSig ) return propagateFloat64NaN( a, b );
2609         return a;
2610     }
2611     if ( bExp == 0 ) {
2612         --expDiff;
2613     }
2614     else {
2615         bSig |= LIT64( 0x4000000000000000 );
2616     }
2617     shift64RightJamming( bSig, expDiff, &bSig );
2618     aSig |= LIT64( 0x4000000000000000 );
2619  aBigger:
2620     zSig = aSig - bSig;
2621     zExp = aExp;
2622  normalizeRoundAndPack:
2623     --zExp;
2624     return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2625
2626 }
2627
2628 /*----------------------------------------------------------------------------
2629 | Returns the result of adding the double-precision floating-point values `a'
2630 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
2631 | Binary Floating-Point Arithmetic.
2632 *----------------------------------------------------------------------------*/
2633
2634 float64 float64_add( float64 a, float64 b )
2635 {
2636     flag aSign, bSign;
2637
2638     aSign = extractFloat64Sign( a );
2639     bSign = extractFloat64Sign( b );
2640     if ( aSign == bSign ) {
2641         return addFloat64Sigs( a, b, aSign );
2642     }
2643     else {
2644         return subFloat64Sigs( a, b, aSign );
2645     }
2646
2647 }
2648
2649 /*----------------------------------------------------------------------------
2650 | Returns the result of subtracting the double-precision floating-point values
2651 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2652 | for Binary Floating-Point Arithmetic.
2653 *----------------------------------------------------------------------------*/
2654
2655 float64 float64_sub( float64 a, float64 b )
2656 {
2657     flag aSign, bSign;
2658
2659     aSign = extractFloat64Sign( a );
2660     bSign = extractFloat64Sign( b );
2661     if ( aSign == bSign ) {
2662         return subFloat64Sigs( a, b, aSign );
2663     }
2664     else {
2665         return addFloat64Sigs( a, b, aSign );
2666     }
2667
2668 }
2669
2670 /*----------------------------------------------------------------------------
2671 | Returns the result of multiplying the double-precision floating-point values
2672 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
2673 | for Binary Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2675
2676 float64 float64_mul( float64 a, float64 b )
2677 {
2678     flag aSign, bSign, zSign;
2679     int16 aExp, bExp, zExp;
2680     bits64 aSig, bSig, zSig0, zSig1;
2681
2682     aSig = extractFloat64Frac( a );
2683     aExp = extractFloat64Exp( a );
2684     aSign = extractFloat64Sign( a );
2685     bSig = extractFloat64Frac( b );
2686     bExp = extractFloat64Exp( b );
2687     bSign = extractFloat64Sign( b );
2688     zSign = aSign ^ bSign;
2689     if ( aExp == 0x7FF ) {
2690         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2691             return propagateFloat64NaN( a, b );
2692         }
2693         if ( ( bExp | bSig ) == 0 ) {
2694             float_raise( float_flag_invalid );
2695             return float64_default_nan;
2696         }
2697         return packFloat64( zSign, 0x7FF, 0 );
2698     }
2699     if ( bExp == 0x7FF ) {
2700         if ( bSig ) return propagateFloat64NaN( a, b );
2701         if ( ( aExp | aSig ) == 0 ) {
2702             float_raise( float_flag_invalid );
2703             return float64_default_nan;
2704         }
2705         return packFloat64( zSign, 0x7FF, 0 );
2706     }
2707     if ( aExp == 0 ) {
2708         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2709         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2710     }
2711     if ( bExp == 0 ) {
2712         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2713         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2714     }
2715     zExp = aExp + bExp - 0x3FF;
2716     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2717     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2718     mul64To128( aSig, bSig, &zSig0, &zSig1 );
2719     zSig0 |= ( zSig1 != 0 );
2720     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2721         zSig0 <<= 1;
2722         --zExp;
2723     }
2724     return roundAndPackFloat64( zSign, zExp, zSig0 );
2725
2726 }
2727
2728 /*----------------------------------------------------------------------------
2729 | Returns the result of dividing the double-precision floating-point value `a'
2730 | by the corresponding value `b'.  The operation is performed according to
2731 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2732 *----------------------------------------------------------------------------*/
2733
2734 float64 float64_div( float64 a, float64 b )
2735 {
2736     flag aSign, bSign, zSign;
2737     int16 aExp, bExp, zExp;
2738     bits64 aSig, bSig, zSig;
2739     bits64 rem0, rem1;
2740     bits64 term0, term1;
2741
2742     aSig = extractFloat64Frac( a );
2743     aExp = extractFloat64Exp( a );
2744     aSign = extractFloat64Sign( a );
2745     bSig = extractFloat64Frac( b );
2746     bExp = extractFloat64Exp( b );
2747     bSign = extractFloat64Sign( b );
2748     zSign = aSign ^ bSign;
2749     if ( aExp == 0x7FF ) {
2750         if ( aSig ) return propagateFloat64NaN( a, b );
2751         if ( bExp == 0x7FF ) {
2752             if ( bSig ) return propagateFloat64NaN( a, b );
2753             float_raise( float_flag_invalid );
2754             return float64_default_nan;
2755         }
2756         return packFloat64( zSign, 0x7FF, 0 );
2757     }
2758     if ( bExp == 0x7FF ) {
2759         if ( bSig ) return propagateFloat64NaN( a, b );
2760         return packFloat64( zSign, 0, 0 );
2761     }
2762     if ( bExp == 0 ) {
2763         if ( bSig == 0 ) {
2764             if ( ( aExp | aSig ) == 0 ) {
2765                 float_raise( float_flag_invalid );
2766                 return float64_default_nan;
2767             }
2768             float_raise( float_flag_divbyzero );
2769             return packFloat64( zSign, 0x7FF, 0 );
2770         }
2771         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2772     }
2773     if ( aExp == 0 ) {
2774         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2775         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2776     }
2777     zExp = aExp - bExp + 0x3FD;
2778     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2779     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2780     if ( bSig <= ( aSig + aSig ) ) {
2781         aSig >>= 1;
2782         ++zExp;
2783     }
2784     zSig = estimateDiv128To64( aSig, 0, bSig );
2785     if ( ( zSig & 0x1FF ) <= 2 ) {
2786         mul64To128( bSig, zSig, &term0, &term1 );
2787         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2788         while ( (sbits64) rem0 < 0 ) {
2789             --zSig;
2790             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2791         }
2792         zSig |= ( rem1 != 0 );
2793     }
2794     return roundAndPackFloat64( zSign, zExp, zSig );
2795
2796 }
2797
2798 /*----------------------------------------------------------------------------
2799 | Returns the remainder of the double-precision floating-point value `a'
2800 | with respect to the corresponding value `b'.  The operation is performed
2801 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2802 *----------------------------------------------------------------------------*/
2803
2804 float64 float64_rem( float64 a, float64 b )
2805 {
2806     flag aSign, bSign, zSign;
2807     int16 aExp, bExp, expDiff;
2808     bits64 aSig, bSig;
2809     bits64 q, alternateASig;
2810     sbits64 sigMean;
2811
2812     aSig = extractFloat64Frac( a );
2813     aExp = extractFloat64Exp( a );
2814     aSign = extractFloat64Sign( a );
2815     bSig = extractFloat64Frac( b );
2816     bExp = extractFloat64Exp( b );
2817     bSign = extractFloat64Sign( b );
2818     if ( aExp == 0x7FF ) {
2819         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2820             return propagateFloat64NaN( a, b );
2821         }
2822         float_raise( float_flag_invalid );
2823         return float64_default_nan;
2824     }
2825     if ( bExp == 0x7FF ) {
2826         if ( bSig ) return propagateFloat64NaN( a, b );
2827         return a;
2828     }
2829     if ( bExp == 0 ) {
2830         if ( bSig == 0 ) {
2831             float_raise( float_flag_invalid );
2832             return float64_default_nan;
2833         }
2834         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2835     }
2836     if ( aExp == 0 ) {
2837         if ( aSig == 0 ) return a;
2838         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2839     }
2840     expDiff = aExp - bExp;
2841     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2842     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2843     if ( expDiff < 0 ) {
2844         if ( expDiff < -1 ) return a;
2845         aSig >>= 1;
2846     }
2847     q = ( bSig <= aSig );
2848     if ( q ) aSig -= bSig;
2849     expDiff -= 64;
2850     while ( 0 < expDiff ) {
2851         q = estimateDiv128To64( aSig, 0, bSig );
2852         q = ( 2 < q ) ? q - 2 : 0;
2853         aSig = - ( ( bSig>>2 ) * q );
2854         expDiff -= 62;
2855     }
2856     expDiff += 64;
2857     if ( 0 < expDiff ) {
2858         q = estimateDiv128To64( aSig, 0, bSig );
2859         q = ( 2 < q ) ? q - 2 : 0;
2860         q >>= 64 - expDiff;
2861         bSig >>= 2;
2862         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2863     }
2864     else {
2865         aSig >>= 2;
2866         bSig >>= 2;
2867     }
2868     do {
2869         alternateASig = aSig;
2870         ++q;
2871         aSig -= bSig;
2872     } while ( 0 <= (sbits64) aSig );
2873     sigMean = aSig + alternateASig;
2874     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2875         aSig = alternateASig;
2876     }
2877     zSign = ( (sbits64) aSig < 0 );
2878     if ( zSign ) aSig = - aSig;
2879     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2880
2881 }
2882
2883 /*----------------------------------------------------------------------------
2884 | Returns the square root of the double-precision floating-point value `a'.
2885 | The operation is performed according to the IEC/IEEE Standard for Binary
2886 | Floating-Point Arithmetic.
2887 *----------------------------------------------------------------------------*/
2888
2889 float64 float64_sqrt( float64 a )
2890 {
2891     flag aSign;
2892     int16 aExp, zExp;
2893     bits64 aSig, zSig, doubleZSig;
2894     bits64 rem0, rem1, term0, term1;
2895     float64 z;
2896
2897     aSig = extractFloat64Frac( a );
2898     aExp = extractFloat64Exp( a );
2899     aSign = extractFloat64Sign( a );
2900     if ( aExp == 0x7FF ) {
2901         if ( aSig ) return propagateFloat64NaN( a, a );
2902         if ( ! aSign ) return a;
2903         float_raise( float_flag_invalid );
2904         return float64_default_nan;
2905     }
2906     if ( aSign ) {
2907         if ( ( aExp | aSig ) == 0 ) return a;
2908         float_raise( float_flag_invalid );
2909         return float64_default_nan;
2910     }
2911     if ( aExp == 0 ) {
2912         if ( aSig == 0 ) return 0;
2913         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2914     }
2915     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2916     aSig |= LIT64( 0x0010000000000000 );
2917     zSig = estimateSqrt32( aExp, aSig>>21 );
2918     aSig <<= 9 - ( aExp & 1 );
2919     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2920     if ( ( zSig & 0x1FF ) <= 5 ) {
2921         doubleZSig = zSig<<1;
2922         mul64To128( zSig, zSig, &term0, &term1 );
2923         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2924         while ( (sbits64) rem0 < 0 ) {
2925             --zSig;
2926             doubleZSig -= 2;
2927             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2928         }
2929         zSig |= ( ( rem0 | rem1 ) != 0 );
2930     }
2931     return roundAndPackFloat64( 0, zExp, zSig );
2932
2933 }
2934
2935 /*----------------------------------------------------------------------------
2936 | Returns 1 if the double-precision floating-point value `a' is equal to the
2937 | corresponding value `b', and 0 otherwise.  The comparison is performed
2938 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2939 *----------------------------------------------------------------------------*/
2940
2941 flag float64_eq( float64 a, float64 b )
2942 {
2943
2944     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2945          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2946        ) {
2947         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2948             float_raise( float_flag_invalid );
2949         }
2950         return 0;
2951     }
2952     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2953
2954 }
2955
2956 /*----------------------------------------------------------------------------
2957 | Returns 1 if the double-precision floating-point value `a' is less than or
2958 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
2959 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2960 | Arithmetic.
2961 *----------------------------------------------------------------------------*/
2962
2963 flag float64_le( float64 a, float64 b )
2964 {
2965     flag aSign, bSign;
2966
2967     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2968          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2969        ) {
2970         float_raise( float_flag_invalid );
2971         return 0;
2972     }
2973     aSign = extractFloat64Sign( a );
2974     bSign = extractFloat64Sign( b );
2975     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2976     return ( a == b ) || ( aSign ^ ( a < b ) );
2977
2978 }
2979
2980 /*----------------------------------------------------------------------------
2981 | Returns 1 if the double-precision floating-point value `a' is less than
2982 | the corresponding value `b', and 0 otherwise.  The comparison is performed
2983 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2984 *----------------------------------------------------------------------------*/
2985
2986 flag float64_lt( float64 a, float64 b )
2987 {
2988     flag aSign, bSign;
2989
2990     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2991          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2992        ) {
2993         float_raise( float_flag_invalid );
2994         return 0;
2995     }
2996     aSign = extractFloat64Sign( a );
2997     bSign = extractFloat64Sign( b );
2998     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2999     return ( a != b ) && ( aSign ^ ( a < b ) );
3000
3001 }
3002
3003 /*----------------------------------------------------------------------------
3004 | Returns 1 if the double-precision floating-point value `a' is equal to the
3005 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
3006 | if either operand is a NaN.  Otherwise, the comparison is performed
3007 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3008 *----------------------------------------------------------------------------*/
3009
3010 flag float64_eq_signaling( float64 a, float64 b )
3011 {
3012
3013     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3014          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3015        ) {
3016         float_raise( float_flag_invalid );
3017         return 0;
3018     }
3019     return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3020
3021 }
3022
3023 /*----------------------------------------------------------------------------
3024 | Returns 1 if the double-precision floating-point value `a' is less than or
3025 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
3026 | cause an exception.  Otherwise, the comparison is performed according to the
3027 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3028 *----------------------------------------------------------------------------*/
3029
3030 flag float64_le_quiet( float64 a, float64 b )
3031 {
3032     flag aSign, bSign;
3033     int16 aExp, bExp;
3034
3035     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3036          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3037        ) {
3038         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3039             float_raise( float_flag_invalid );
3040         }
3041         return 0;
3042     }
3043     aSign = extractFloat64Sign( a );
3044     bSign = extractFloat64Sign( b );
3045     if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3046     return ( a == b ) || ( aSign ^ ( a < b ) );
3047
3048 }
3049
3050 /*----------------------------------------------------------------------------
3051 | Returns 1 if the double-precision floating-point value `a' is less than
3052 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
3053 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
3054 | Standard for Binary Floating-Point Arithmetic.
3055 *----------------------------------------------------------------------------*/
3056
3057 flag float64_lt_quiet( float64 a, float64 b )
3058 {
3059     flag aSign, bSign;
3060
3061     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3062          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3063        ) {
3064         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3065             float_raise( float_flag_invalid );
3066         }
3067         return 0;
3068     }
3069     aSign = extractFloat64Sign( a );
3070     bSign = extractFloat64Sign( b );
3071     if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3072     return ( a != b ) && ( aSign ^ ( a < b ) );
3073
3074 }
3075
3076 #ifdef FLOATX80
3077
3078 /*----------------------------------------------------------------------------
3079 | Returns the result of converting the extended double-precision floating-
3080 | point value `a' to the 32-bit two's complement integer format.  The
3081 | conversion is performed according to the IEC/IEEE Standard for Binary
3082 | Floating-Point Arithmetic---which means in particular that the conversion
3083 | is rounded according to the current rounding mode.  If `a' is a NaN, the
3084 | largest positive integer is returned.  Otherwise, if the conversion
3085 | overflows, the largest integer with the same sign as `a' is returned.
3086 *----------------------------------------------------------------------------*/
3087
3088 int32 floatx80_to_int32( floatx80 a )
3089 {
3090     flag aSign;
3091     int32 aExp, shiftCount;
3092     bits64 aSig;
3093
3094     aSig = extractFloatx80Frac( a );
3095     aExp = extractFloatx80Exp( a );
3096     aSign = extractFloatx80Sign( a );
3097     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3098     shiftCount = 0x4037 - aExp;
3099     if ( shiftCount <= 0 ) shiftCount = 1;
3100     shift64RightJamming( aSig, shiftCount, &aSig );
3101     return roundAndPackInt32( aSign, aSig );
3102
3103 }
3104
3105 /*----------------------------------------------------------------------------
3106 | Returns the result of converting the extended double-precision floating-
3107 | point value `a' to the 32-bit two's complement integer format.  The
3108 | conversion is performed according to the IEC/IEEE Standard for Binary
3109 | Floating-Point Arithmetic, except that the conversion is always rounded
3110 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3111 | Otherwise, if the conversion overflows, the largest integer with the same
3112 | sign as `a' is returned.
3113 *----------------------------------------------------------------------------*/
3114
3115 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3116 {
3117     flag aSign;
3118     int32 aExp, shiftCount;
3119     bits64 aSig, savedASig;
3120     int32 z;
3121
3122     aSig = extractFloatx80Frac( a );
3123     aExp = extractFloatx80Exp( a );
3124     aSign = extractFloatx80Sign( a );
3125     if ( 0x401E < aExp ) {
3126         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3127         goto invalid;
3128     }
3129     else if ( aExp < 0x3FFF ) {
3130         if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3131         return 0;
3132     }
3133     shiftCount = 0x403E - aExp;
3134     savedASig = aSig;
3135     aSig >>= shiftCount;
3136     z = aSig;
3137     if ( aSign ) z = - z;
3138     if ( ( z < 0 ) ^ aSign ) {
3139  invalid:
3140         float_raise( float_flag_invalid );
3141         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3142     }
3143     if ( ( aSig<<shiftCount ) != savedASig ) {
3144         float_exception_flags |= float_flag_inexact;
3145     }
3146     return z;
3147
3148 }
3149
3150 /*----------------------------------------------------------------------------
3151 | Returns the result of converting the extended double-precision floating-
3152 | point value `a' to the 64-bit two's complement integer format.  The
3153 | conversion is performed according to the IEC/IEEE Standard for Binary
3154 | Floating-Point Arithmetic---which means in particular that the conversion
3155 | is rounded according to the current rounding mode.  If `a' is a NaN,
3156 | the largest positive integer is returned.  Otherwise, if the conversion
3157 | overflows, the largest integer with the same sign as `a' is returned.
3158 *----------------------------------------------------------------------------*/
3159
3160 int64 floatx80_to_int64( floatx80 a )
3161 {
3162     flag aSign;
3163     int32 aExp, shiftCount;
3164     bits64 aSig, aSigExtra;
3165
3166     aSig = extractFloatx80Frac( a );
3167     aExp = extractFloatx80Exp( a );
3168     aSign = extractFloatx80Sign( a );
3169     shiftCount = 0x403E - aExp;
3170     if ( shiftCount <= 0 ) {
3171         if ( shiftCount ) {
3172             float_raise( float_flag_invalid );
3173             if (    ! aSign
3174                  || (    ( aExp == 0x7FFF )
3175                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
3176                ) {
3177                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3178             }
3179             return (sbits64) LIT64( 0x8000000000000000 );
3180         }
3181         aSigExtra = 0;
3182     }
3183     else {
3184         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3185     }
3186     return roundAndPackInt64( aSign, aSig, aSigExtra );
3187
3188 }
3189
3190 /*----------------------------------------------------------------------------
3191 | Returns the result of converting the extended double-precision floating-
3192 | point value `a' to the 64-bit two's complement integer format.  The
3193 | conversion is performed according to the IEC/IEEE Standard for Binary
3194 | Floating-Point Arithmetic, except that the conversion is always rounded
3195 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
3196 | Otherwise, if the conversion overflows, the largest integer with the same
3197 | sign as `a' is returned.
3198 *----------------------------------------------------------------------------*/
3199
3200 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3201 {
3202     flag aSign;
3203     int32 aExp, shiftCount;
3204     bits64 aSig;
3205     int64 z;
3206
3207     aSig = extractFloatx80Frac( a );
3208     aExp = extractFloatx80Exp( a );
3209     aSign = extractFloatx80Sign( a );
3210     shiftCount = aExp - 0x403E;
3211     if ( 0 <= shiftCount ) {
3212         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3213         if ( ( a.high != 0xC03E ) || aSig ) {
3214             float_raise( float_flag_invalid );
3215             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3216                 return LIT64( 0x7FFFFFFFFFFFFFFF );
3217             }
3218         }
3219         return (sbits64) LIT64( 0x8000000000000000 );
3220     }
3221     else if ( aExp < 0x3FFF ) {
3222         if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3223         return 0;
3224     }
3225     z = aSig>>( - shiftCount );
3226     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3227         float_exception_flags |= float_flag_inexact;
3228     }
3229     if ( aSign ) z = - z;
3230     return z;
3231
3232 }
3233
3234 /*----------------------------------------------------------------------------
3235 | Returns the result of converting the extended double-precision floating-
3236 | point value `a' to the single-precision floating-point format.  The
3237 | conversion is performed according to the IEC/IEEE Standard for Binary
3238 | Floating-Point Arithmetic.
3239 *----------------------------------------------------------------------------*/
3240
3241 float32 floatx80_to_float32( floatx80 a )
3242 {
3243     flag aSign;
3244     int32 aExp;
3245     bits64 aSig;
3246
3247     aSig = extractFloatx80Frac( a );
3248     aExp = extractFloatx80Exp( a );
3249     aSign = extractFloatx80Sign( a );
3250     if ( aExp == 0x7FFF ) {
3251         if ( (bits64) ( aSig<<1 ) ) {
3252             return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3253         }
3254         return packFloat32( aSign, 0xFF, 0 );
3255     }
3256     shift64RightJamming( aSig, 33, &aSig );
3257     if ( aExp || aSig ) aExp -= 0x3F81;
3258     return roundAndPackFloat32( aSign, aExp, aSig );
3259
3260 }
3261
3262 /*----------------------------------------------------------------------------
3263 | Returns the result of converting the extended double-precision floating-
3264 | point value `a' to the double-precision floating-point format.  The
3265 | conversion is performed according to the IEC/IEEE Standard for Binary
3266 | Floating-Point Arithmetic.
3267 *----------------------------------------------------------------------------*/
3268
3269 float64 floatx80_to_float64( floatx80 a )
3270 {
3271     flag aSign;
3272     int32 aExp;
3273     bits64 aSig, zSig;
3274
3275     aSig = extractFloatx80Frac( a );
3276     aExp = extractFloatx80Exp( a );
3277     aSign = extractFloatx80Sign( a );
3278     if ( aExp == 0x7FFF ) {
3279         if ( (bits64) ( aSig<<1 ) ) {
3280             return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3281         }
3282         return packFloat64( aSign, 0x7FF, 0 );
3283     }
3284     shift64RightJamming( aSig, 1, &zSig );
3285     if ( aExp || aSig ) aExp -= 0x3C01;
3286     return roundAndPackFloat64( aSign, aExp, zSig );
3287
3288 }
3289
3290 #ifdef FLOAT128
3291
3292 /*----------------------------------------------------------------------------
3293 | Returns the result of converting the extended double-precision floating-
3294 | point value `a' to the quadruple-precision floating-point format.  The
3295 | conversion is performed according to the IEC/IEEE Standard for Binary
3296 | Floating-Point Arithmetic.
3297 *----------------------------------------------------------------------------*/
3298
3299 float128 floatx80_to_float128( floatx80 a )
3300 {
3301     flag aSign;
3302     int16 aExp;
3303     bits64 aSig, zSig0, zSig1;
3304
3305     aSig = extractFloatx80Frac( a );
3306     aExp = extractFloatx80Exp( a );
3307     aSign = extractFloatx80Sign( a );
3308     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3309         return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3310     }
3311     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3312     return packFloat128( aSign, aExp, zSig0, zSig1 );
3313
3314 }
3315
3316 #endif
3317
3318 /*----------------------------------------------------------------------------
3319 | Rounds the extended double-precision floating-point value `a' to an integer,
3320 | and returns the result as an extended quadruple-precision floating-point
3321 | value.  The operation is performed according to the IEC/IEEE Standard for
3322 | Binary Floating-Point Arithmetic.
3323 *----------------------------------------------------------------------------*/
3324
3325 floatx80 floatx80_round_to_int( floatx80 a )
3326 {
3327     flag aSign;
3328     int32 aExp;
3329     bits64 lastBitMask, roundBitsMask;
3330     int8 roundingMode;
3331     floatx80 z;
3332
3333     aExp = extractFloatx80Exp( a );
3334     if ( 0x403E <= aExp ) {
3335         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3336             return propagateFloatx80NaN( a, a );
3337         }
3338         return a;
3339     }
3340     if ( aExp < 0x3FFF ) {
3341         if (    ( aExp == 0 )
3342              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3343             return a;
3344         }
3345         float_exception_flags |= float_flag_inexact;
3346         aSign = extractFloatx80Sign( a );
3347         switch ( float_rounding_mode ) {
3348          case float_round_nearest_even:
3349             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3350                ) {
3351                 return
3352                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3353             }
3354             break;
3355          case float_round_down:
3356             return
3357                   aSign ?
3358                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3359                 : packFloatx80( 0, 0, 0 );
3360          case float_round_up:
3361             return
3362                   aSign ? packFloatx80( 1, 0, 0 )
3363                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3364         }
3365         return packFloatx80( aSign, 0, 0 );
3366     }
3367     lastBitMask = 1;
3368     lastBitMask <<= 0x403E - aExp;
3369     roundBitsMask = lastBitMask - 1;
3370     z = a;
3371     roundingMode = float_rounding_mode;
3372     if ( roundingMode == float_round_nearest_even ) {
3373         z.low += lastBitMask>>1;
3374         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3375     }
3376     else if ( roundingMode != float_round_to_zero ) {
3377         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3378             z.low += roundBitsMask;
3379         }
3380     }
3381     z.low &= ~ roundBitsMask;
3382     if ( z.low == 0 ) {
3383         ++z.high;
3384         z.low = LIT64( 0x8000000000000000 );
3385     }
3386     if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3387     return z;
3388
3389 }
3390
3391 /*----------------------------------------------------------------------------
3392 | Returns the result of adding the absolute values of the extended double-
3393 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
3394 | negated before being returned.  `zSign' is ignored if the result is a NaN.
3395 | The addition is performed according to the IEC/IEEE Standard for Binary
3396 | Floating-Point Arithmetic.
3397 *----------------------------------------------------------------------------*/
3398
3399 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3400 {
3401     int32 aExp, bExp, zExp;
3402     bits64 aSig, bSig, zSig0, zSig1;
3403     int32 expDiff;
3404
3405     aSig = extractFloatx80Frac( a );
3406     aExp = extractFloatx80Exp( a );
3407     bSig = extractFloatx80Frac( b );
3408     bExp = extractFloatx80Exp( b );
3409     expDiff = aExp - bExp;
3410     if ( 0 < expDiff ) {
3411         if ( aExp == 0x7FFF ) {
3412             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3413             return a;
3414         }
3415         if ( bExp == 0 ) --expDiff;
3416         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3417         zExp = aExp;
3418     }
3419     else if ( expDiff < 0 ) {
3420         if ( bExp == 0x7FFF ) {
3421             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3422             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3423         }
3424         if ( aExp == 0 ) ++expDiff;
3425         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3426         zExp = bExp;
3427     }
3428     else {
3429         if ( aExp == 0x7FFF ) {
3430             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3431                 return propagateFloatx80NaN( a, b );
3432             }
3433             return a;
3434         }
3435         zSig1 = 0;
3436         zSig0 = aSig + bSig;
3437         if ( aExp == 0 ) {
3438             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3439             goto roundAndPack;
3440         }
3441         zExp = aExp;
3442         goto shiftRight1;
3443     }
3444     zSig0 = aSig + bSig;
3445     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3446  shiftRight1:
3447     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3448     zSig0 |= LIT64( 0x8000000000000000 );
3449     ++zExp;
3450  roundAndPack:
3451     return
3452         roundAndPackFloatx80(
3453             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3454
3455 }
3456
3457 /*----------------------------------------------------------------------------
3458 | Returns the result of subtracting the absolute values of the extended
3459 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
3460 | difference is negated before being returned.  `zSign' is ignored if the
3461 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
3462 | Standard for Binary Floating-Point Arithmetic.
3463 *----------------------------------------------------------------------------*/
3464
3465 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3466 {
3467     int32 aExp, bExp, zExp;
3468     bits64 aSig, bSig, zSig0, zSig1;
3469     int32 expDiff;
3470     floatx80 z;
3471
3472     aSig = extractFloatx80Frac( a );
3473     aExp = extractFloatx80Exp( a );
3474     bSig = extractFloatx80Frac( b );
3475     bExp = extractFloatx80Exp( b );
3476     expDiff = aExp - bExp;
3477     if ( 0 < expDiff ) goto aExpBigger;
3478     if ( expDiff < 0 ) goto bExpBigger;
3479     if ( aExp == 0x7FFF ) {
3480         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3481             return propagateFloatx80NaN( a, b );
3482         }
3483         float_raise( float_flag_invalid );
3484         z.low = floatx80_default_nan_low;
3485         z.high = floatx80_default_nan_high;
3486         return z;
3487     }
3488     if ( aExp == 0 ) {
3489         aExp = 1;
3490         bExp = 1;
3491     }
3492     zSig1 = 0;
3493     if ( bSig < aSig ) goto aBigger;
3494     if ( aSig < bSig ) goto bBigger;
3495     return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3496  bExpBigger:
3497     if ( bExp == 0x7FFF ) {
3498         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3499         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3500     }
3501     if ( aExp == 0 ) ++expDiff;
3502     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3503  bBigger:
3504     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3505     zExp = bExp;
3506     zSign ^= 1;
3507     goto normalizeRoundAndPack;
3508  aExpBigger:
3509     if ( aExp == 0x7FFF ) {
3510         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3511         return a;
3512     }
3513     if ( bExp == 0 ) --expDiff;
3514     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3515  aBigger:
3516     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3517     zExp = aExp;
3518  normalizeRoundAndPack:
3519     return
3520         normalizeRoundAndPackFloatx80(
3521             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3522
3523 }
3524
3525 /*----------------------------------------------------------------------------
3526 | Returns the result of adding the extended double-precision floating-point
3527 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
3528 | Standard for Binary Floating-Point Arithmetic.
3529 *----------------------------------------------------------------------------*/
3530
3531 floatx80 floatx80_add( floatx80 a, floatx80 b )
3532 {
3533     flag aSign, bSign;
3534
3535     aSign = extractFloatx80Sign( a );
3536     bSign = extractFloatx80Sign( b );
3537     if ( aSign == bSign ) {
3538         return addFloatx80Sigs( a, b, aSign );
3539     }
3540     else {
3541         return subFloatx80Sigs( a, b, aSign );
3542     }
3543
3544 }
3545
3546 /*----------------------------------------------------------------------------
3547 | Returns the result of subtracting the extended double-precision floating-
3548 | point values `a' and `b'.  The operation is performed according to the
3549 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3550 *----------------------------------------------------------------------------*/
3551
3552 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3553 {
3554     flag aSign, bSign;
3555
3556     aSign = extractFloatx80Sign( a );
3557     bSign = extractFloatx80Sign( b );
3558     if ( aSign == bSign ) {
3559         return subFloatx80Sigs( a, b, aSign );
3560     }
3561     else {
3562         return addFloatx80Sigs( a, b, aSign );
3563     }
3564
3565 }
3566
3567 /*----------------------------------------------------------------------------
3568 | Returns the result of multiplying the extended double-precision floating-
3569 | point values `a' and `b'.  The operation is performed according to the
3570 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571 *----------------------------------------------------------------------------*/
3572
3573 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3574 {
3575     flag aSign, bSign, zSign;
3576     int32 aExp, bExp, zExp;
3577     bits64 aSig, bSig, zSig0, zSig1;
3578     floatx80 z;
3579
3580     aSig = extractFloatx80Frac( a );
3581     aExp = extractFloatx80Exp( a );
3582     aSign = extractFloatx80Sign( a );
3583     bSig = extractFloatx80Frac( b );
3584     bExp = extractFloatx80Exp( b );
3585     bSign = extractFloatx80Sign( b );
3586     zSign = aSign ^ bSign;
3587     if ( aExp == 0x7FFF ) {
3588         if (    (bits64) ( aSig<<1 )
3589              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3590             return propagateFloatx80NaN( a, b );
3591         }
3592         if ( ( bExp | bSig ) == 0 ) goto invalid;
3593         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3594     }
3595     if ( bExp == 0x7FFF ) {
3596         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3597         if ( ( aExp | aSig ) == 0 ) {
3598  invalid:
3599             float_raise( float_flag_invalid );
3600             z.low = floatx80_default_nan_low;
3601             z.high = floatx80_default_nan_high;
3602             return z;
3603         }
3604         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3605     }
3606     if ( aExp == 0 ) {
3607         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3608         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3609     }
3610     if ( bExp == 0 ) {
3611         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3612         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3613     }
3614     zExp = aExp + bExp - 0x3FFE;
3615     mul64To128( aSig, bSig, &zSig0, &zSig1 );
3616     if ( 0 < (sbits64) zSig0 ) {
3617         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3618         --zExp;
3619     }
3620     return
3621         roundAndPackFloatx80(
3622             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3623
3624 }
3625
3626 /*----------------------------------------------------------------------------
3627 | Returns the result of dividing the extended double-precision floating-point
3628 | value `a' by the corresponding value `b'.  The operation is performed
3629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3630 *----------------------------------------------------------------------------*/
3631
3632 floatx80 floatx80_div( floatx80 a, floatx80 b )
3633 {
3634     flag aSign, bSign, zSign;
3635     int32 aExp, bExp, zExp;
3636     bits64 aSig, bSig, zSig0, zSig1;
3637     bits64 rem0, rem1, rem2, term0, term1, term2;
3638     floatx80 z;
3639
3640     aSig = extractFloatx80Frac( a );
3641     aExp = extractFloatx80Exp( a );
3642     aSign = extractFloatx80Sign( a );
3643     bSig = extractFloatx80Frac( b );
3644     bExp = extractFloatx80Exp( b );
3645     bSign = extractFloatx80Sign( b );
3646     zSign = aSign ^ bSign;
3647     if ( aExp == 0x7FFF ) {
3648         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3649         if ( bExp == 0x7FFF ) {
3650             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3651             goto invalid;
3652         }
3653         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3654     }
3655     if ( bExp == 0x7FFF ) {
3656         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3657         return packFloatx80( zSign, 0, 0 );
3658     }
3659     if ( bExp == 0 ) {
3660         if ( bSig == 0 ) {
3661             if ( ( aExp | aSig ) == 0 ) {
3662  invalid:
3663                 float_raise( float_flag_invalid );
3664                 z.low = floatx80_default_nan_low;
3665                 z.high = floatx80_default_nan_high;
3666                 return z;
3667             }
3668             float_raise( float_flag_divbyzero );
3669             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670         }
3671         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3672     }
3673     if ( aExp == 0 ) {
3674         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3675         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3676     }
3677     zExp = aExp - bExp + 0x3FFE;
3678     rem1 = 0;
3679     if ( bSig <= aSig ) {
3680         shift128Right( aSig, 0, 1, &aSig, &rem1 );
3681         ++zExp;
3682     }
3683     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3684     mul64To128( bSig, zSig0, &term0, &term1 );
3685     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3686     while ( (sbits64) rem0 < 0 ) {
3687         --zSig0;
3688         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3689     }
3690     zSig1 = estimateDiv128To64( rem1, 0, bSig );
3691     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3692         mul64To128( bSig, zSig1, &term1, &term2 );
3693         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3694         while ( (sbits64) rem1 < 0 ) {
3695             --zSig1;
3696             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3697         }
3698         zSig1 |= ( ( rem1 | rem2 ) != 0 );
3699     }
3700     return
3701         roundAndPackFloatx80(
3702             floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3703
3704 }
3705
3706 /*----------------------------------------------------------------------------
3707 | Returns the remainder of the extended double-precision floating-point value
3708 | `a' with respect to the corresponding value `b'.  The operation is performed
3709 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3710 *----------------------------------------------------------------------------*/
3711
3712 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3713 {
3714     flag aSign, bSign, zSign;
3715     int32 aExp, bExp, expDiff;
3716     bits64 aSig0, aSig1, bSig;
3717     bits64 q, term0, term1, alternateASig0, alternateASig1;
3718     floatx80 z;
3719
3720     aSig0 = extractFloatx80Frac( a );
3721     aExp = extractFloatx80Exp( a );
3722     aSign = extractFloatx80Sign( a );
3723     bSig = extractFloatx80Frac( b );
3724     bExp = extractFloatx80Exp( b );
3725     bSign = extractFloatx80Sign( b );
3726     if ( aExp == 0x7FFF ) {
3727         if (    (bits64) ( aSig0<<1 )
3728              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3729             return propagateFloatx80NaN( a, b );
3730         }
3731         goto invalid;
3732     }
3733     if ( bExp == 0x7FFF ) {
3734         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3735         return a;
3736     }
3737     if ( bExp == 0 ) {
3738         if ( bSig == 0 ) {
3739  invalid:
3740             float_raise( float_flag_invalid );
3741             z.low = floatx80_default_nan_low;
3742             z.high = floatx80_default_nan_high;
3743             return z;
3744         }
3745         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3746     }
3747     if ( aExp == 0 ) {
3748         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3749         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3750     }
3751     bSig |= LIT64( 0x8000000000000000 );
3752     zSign = aSign;
3753     expDiff = aExp - bExp;
3754     aSig1 = 0;
3755     if ( expDiff < 0 ) {
3756         if ( expDiff < -1 ) return a;
3757         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3758         expDiff = 0;
3759     }
3760     q = ( bSig <= aSig0 );
3761     if ( q ) aSig0 -= bSig;
3762     expDiff -= 64;
3763     while ( 0 < expDiff ) {
3764         q = estimateDiv128To64( aSig0, aSig1, bSig );
3765         q = ( 2 < q ) ? q - 2 : 0;
3766         mul64To128( bSig, q, &term0, &term1 );
3767         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3768         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3769         expDiff -= 62;
3770     }
3771     expDiff += 64;
3772     if ( 0 < expDiff ) {
3773         q = estimateDiv128To64( aSig0, aSig1, bSig );
3774         q = ( 2 < q ) ? q - 2 : 0;
3775         q >>= 64 - expDiff;
3776         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3777         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3778         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3779         while ( le128( term0, term1, aSig0, aSig1 ) ) {
3780             ++q;
3781             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3782         }
3783     }
3784     else {
3785         term1 = 0;
3786         term0 = bSig;
3787     }
3788     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3789     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3790          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3791               && ( q & 1 ) )
3792        ) {
3793         aSig0 = alternateASig0;
3794         aSig1 = alternateASig1;
3795         zSign = ! zSign;
3796     }
3797     return
3798         normalizeRoundAndPackFloatx80(
3799             80, zSign, bExp + expDiff, aSig0, aSig1 );
3800
3801 }
3802
3803 /*----------------------------------------------------------------------------
3804 | Returns the square root of the extended double-precision floating-point
3805 | value `a'.  The operation is performed according to the IEC/IEEE Standard
3806 | for Binary Floating-Point Arithmetic.
3807 *----------------------------------------------------------------------------*/
3808
3809 floatx80 floatx80_sqrt( floatx80 a )
3810 {
3811     flag aSign;
3812     int32 aExp, zExp;
3813     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3814     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3815     floatx80 z;
3816
3817     aSig0 = extractFloatx80Frac( a );
3818     aExp = extractFloatx80Exp( a );
3819     aSign = extractFloatx80Sign( a );
3820     if ( aExp == 0x7FFF ) {
3821         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3822         if ( ! aSign ) return a;
3823         goto invalid;
3824     }
3825     if ( aSign ) {
3826         if ( ( aExp | aSig0 ) == 0 ) return a;
3827  invalid:
3828         float_raise( float_flag_invalid );
3829         z.low = floatx80_default_nan_low;
3830         z.high = floatx80_default_nan_high;
3831         return z;
3832     }
3833     if ( aExp == 0 ) {
3834         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3835         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3836     }
3837     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3838     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3839     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3840     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3841     doubleZSig0 = zSig0<<1;
3842     mul64To128( zSig0, zSig0, &term0, &term1 );
3843     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3844     while ( (sbits64) rem0 < 0 ) {
3845         --zSig0;
3846         doubleZSig0 -= 2;
3847         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3848     }
3849     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3850     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3851         if ( zSig1 == 0 ) zSig1 = 1;
3852         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3853         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3854         mul64To128( zSig1, zSig1, &term2, &term3 );
3855         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3856         while ( (sbits64) rem1 < 0 ) {
3857             --zSig1;
3858             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3859             term3 |= 1;
3860             term2 |= doubleZSig0;
3861             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3862         }
3863         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3864     }
3865     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3866     zSig0 |= doubleZSig0;
3867     return
3868         roundAndPackFloatx80(
3869             floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3870
3871 }
3872
3873 /*----------------------------------------------------------------------------
3874 | Returns 1 if the extended double-precision floating-point value `a' is
3875 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
3876 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3877 | Arithmetic.
3878 *----------------------------------------------------------------------------*/
3879
3880 flag floatx80_eq( floatx80 a, floatx80 b )
3881 {
3882
3883     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3884               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3885          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3886               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3887        ) {
3888         if (    floatx80_is_signaling_nan( a )
3889              || floatx80_is_signaling_nan( b ) ) {
3890             float_raise( float_flag_invalid );
3891         }
3892         return 0;
3893     }
3894     return
3895            ( a.low == b.low )
3896         && (    ( a.high == b.high )
3897              || (    ( a.low == 0 )
3898                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3899            );
3900
3901 }
3902
3903 /*----------------------------------------------------------------------------
3904 | Returns 1 if the extended double-precision floating-point value `a' is
3905 | less than or equal to the corresponding value `b', and 0 otherwise.  The
3906 | comparison is performed according to the IEC/IEEE Standard for Binary
3907 | Floating-Point Arithmetic.
3908 *----------------------------------------------------------------------------*/
3909
3910 flag floatx80_le( floatx80 a, floatx80 b )
3911 {
3912     flag aSign, bSign;
3913
3914     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3915               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3916          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3917               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3918        ) {
3919         float_raise( float_flag_invalid );
3920         return 0;
3921     }
3922     aSign = extractFloatx80Sign( a );
3923     bSign = extractFloatx80Sign( b );
3924     if ( aSign != bSign ) {
3925         return
3926                aSign
3927             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3928                  == 0 );
3929     }
3930     return
3931           aSign ? le128( b.high, b.low, a.high, a.low )
3932         : le128( a.high, a.low, b.high, b.low );
3933
3934 }
3935
3936 /*----------------------------------------------------------------------------
3937 | Returns 1 if the extended double-precision floating-point value `a' is
3938 | less than the corresponding value `b', and 0 otherwise.  The comparison
3939 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3940 | Arithmetic.
3941 *----------------------------------------------------------------------------*/
3942
3943 flag floatx80_lt( floatx80 a, floatx80 b )
3944 {
3945     flag aSign, bSign;
3946
3947     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3948               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3949          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3950               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3951        ) {
3952         float_raise( float_flag_invalid );
3953         return 0;
3954     }
3955     aSign = extractFloatx80Sign( a );
3956     bSign = extractFloatx80Sign( b );
3957     if ( aSign != bSign ) {
3958         return
3959                aSign
3960             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3961                  != 0 );
3962     }
3963     return
3964           aSign ? lt128( b.high, b.low, a.high, a.low )
3965         : lt128( a.high, a.low, b.high, b.low );
3966
3967 }
3968
3969 /*----------------------------------------------------------------------------
3970 | Returns 1 if the extended double-precision floating-point value `a' is equal
3971 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
3972 | raised if either operand is a NaN.  Otherwise, the comparison is performed
3973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3974 *----------------------------------------------------------------------------*/
3975
3976 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3977 {
3978
3979     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
3980               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3981          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
3982               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3983        ) {
3984         float_raise( float_flag_invalid );
3985         return 0;
3986     }
3987     return
3988            ( a.low == b.low )
3989         && (    ( a.high == b.high )
3990              || (    ( a.low == 0 )
3991                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3992            );
3993
3994 }
3995
3996 /*----------------------------------------------------------------------------
3997 | Returns 1 if the extended double-precision floating-point value `a' is less
3998 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
3999 | do not cause an exception.  Otherwise, the comparison is performed according
4000 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4001 *----------------------------------------------------------------------------*/
4002
4003 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4004 {
4005     flag aSign, bSign;
4006
4007     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4008               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4009          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4010               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4011        ) {
4012         if (    floatx80_is_signaling_nan( a )
4013              || floatx80_is_signaling_nan( b ) ) {
4014             float_raise( float_flag_invalid );
4015         }
4016         return 0;
4017     }
4018     aSign = extractFloatx80Sign( a );
4019     bSign = extractFloatx80Sign( b );
4020     if ( aSign != bSign ) {
4021         return
4022                aSign
4023             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4024                  == 0 );
4025     }
4026     return
4027           aSign ? le128( b.high, b.low, a.high, a.low )
4028         : le128( a.high, a.low, b.high, b.low );
4029
4030 }
4031
4032 /*----------------------------------------------------------------------------
4033 | Returns 1 if the extended double-precision floating-point value `a' is less
4034 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
4035 | an exception.  Otherwise, the comparison is performed according to the
4036 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4037 *----------------------------------------------------------------------------*/
4038
4039 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4040 {
4041     flag aSign, bSign;
4042
4043     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
4044               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4045          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
4046               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4047        ) {
4048         if (    floatx80_is_signaling_nan( a )
4049              || floatx80_is_signaling_nan( b ) ) {
4050             float_raise( float_flag_invalid );
4051         }
4052         return 0;
4053     }
4054     aSign = extractFloatx80Sign( a );
4055     bSign = extractFloatx80Sign( b );
4056     if ( aSign != bSign ) {
4057         return
4058                aSign
4059             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4060                  != 0 );
4061     }
4062     return
4063           aSign ? lt128( b.high, b.low, a.high, a.low )
4064         : lt128( a.high, a.low, b.high, b.low );
4065
4066 }
4067
4068 #endif
4069
4070 #ifdef FLOAT128
4071
4072 /*----------------------------------------------------------------------------
4073 | Returns the result of converting the quadruple-precision floating-point
4074 | value `a' to the 32-bit two's complement integer format.  The conversion
4075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4076 | Arithmetic---which means in particular that the conversion is rounded
4077 | according to the current rounding mode.  If `a' is a NaN, the largest
4078 | positive integer is returned.  Otherwise, if the conversion overflows, the
4079 | largest integer with the same sign as `a' is returned.
4080 *----------------------------------------------------------------------------*/
4081
4082 int32 float128_to_int32( float128 a )
4083 {
4084     flag aSign;
4085     int32 aExp, shiftCount;
4086     bits64 aSig0, aSig1;
4087
4088     aSig1 = extractFloat128Frac1( a );
4089     aSig0 = extractFloat128Frac0( a );
4090     aExp = extractFloat128Exp( a );
4091     aSign = extractFloat128Sign( a );
4092     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4093     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4094     aSig0 |= ( aSig1 != 0 );
4095     shiftCount = 0x4028 - aExp;
4096     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4097     return roundAndPackInt32( aSign, aSig0 );
4098
4099 }
4100
4101 /*----------------------------------------------------------------------------
4102 | Returns the result of converting the quadruple-precision floating-point
4103 | value `a' to the 32-bit two's complement integer format.  The conversion
4104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4105 | Arithmetic, except that the conversion is always rounded toward zero.  If
4106 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
4107 | conversion overflows, the largest integer with the same sign as `a' is
4108 | returned.
4109 *----------------------------------------------------------------------------*/
4110
4111 int32 float128_to_int32_round_to_zero( float128 a )
4112 {
4113     flag aSign;
4114     int32 aExp, shiftCount;
4115     bits64 aSig0, aSig1, savedASig;
4116     int32 z;
4117
4118     aSig1 = extractFloat128Frac1( a );
4119     aSig0 = extractFloat128Frac0( a );
4120     aExp = extractFloat128Exp( a );
4121     aSign = extractFloat128Sign( a );
4122     aSig0 |= ( aSig1 != 0 );
4123     if ( 0x401E < aExp ) {
4124         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4125         goto invalid;
4126     }
4127     else if ( aExp < 0x3FFF ) {
4128         if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4129         return 0;
4130     }
4131     aSig0 |= LIT64( 0x0001000000000000 );
4132     shiftCount = 0x402F - aExp;
4133     savedASig = aSig0;
4134     aSig0 >>= shiftCount;
4135     z = aSig0;
4136     if ( aSign ) z = - z;
4137     if ( ( z < 0 ) ^ aSign ) {
4138  invalid:
4139         float_raise( float_flag_invalid );
4140         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4141     }
4142     if ( ( aSig0<<shiftCount ) != savedASig ) {
4143         float_exception_flags |= float_flag_inexact;
4144     }
4145     return z;
4146
4147 }
4148
4149 /*----------------------------------------------------------------------------
4150 | Returns the result of converting the quadruple-precision floating-point
4151 | value `a' to the 64-bit two's complement integer format.  The conversion
4152 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4153 | Arithmetic---which means in particular that the conversion is rounded
4154 | according to the current rounding mode.  If `a' is a NaN, the largest
4155 | positive integer is returned.  Otherwise, if the conversion overflows, the
4156 | largest integer with the same sign as `a' is returned.
4157 *----------------------------------------------------------------------------*/
4158
4159 int64 float128_to_int64( float128 a )
4160 {
4161     flag aSign;
4162     int32 aExp, shiftCount;
4163     bits64 aSig0, aSig1;
4164
4165     aSig1 = extractFloat128Frac1( a );
4166     aSig0 = extractFloat128Frac0( a );
4167     aExp = extractFloat128Exp( a );
4168     aSign = extractFloat128Sign( a );
4169     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4170     shiftCount = 0x402F - aExp;
4171     if ( shiftCount <= 0 ) {
4172         if ( 0x403E < aExp ) {
4173             float_raise( float_flag_invalid );
4174             if (    ! aSign
4175                  || (    ( aExp == 0x7FFF )
4176                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4177                     )
4178                ) {
4179                 return LIT64( 0x7FFFFFFFFFFFFFFF );
4180             }
4181             return (sbits64) LIT64( 0x8000000000000000 );
4182         }
4183         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4184     }
4185     else {
4186         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4187     }
4188     return roundAndPackInt64( aSign, aSig0, aSig1 );
4189
4190 }
4191
4192 /*----------------------------------------------------------------------------
4193 | Returns the result of converting the quadruple-precision floating-point
4194 | value `a' to the 64-bit two's complement integer format.  The conversion
4195 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4196 | Arithmetic, except that the conversion is always rounded toward zero.
4197 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
4198 | the conversion overflows, the largest integer with the same sign as `a' is
4199 | returned.
4200 *----------------------------------------------------------------------------*/
4201
4202 int64 float128_to_int64_round_to_zero( float128 a )
4203 {
4204     flag aSign;
4205     int32 aExp, shiftCount;
4206     bits64 aSig0, aSig1;
4207     int64 z;
4208
4209     aSig1 = extractFloat128Frac1( a );
4210     aSig0 = extractFloat128Frac0( a );
4211     aExp = extractFloat128Exp( a );
4212     aSign = extractFloat128Sign( a );
4213     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4214     shiftCount = aExp - 0x402F;
4215     if ( 0 < shiftCount ) {
4216         if ( 0x403E <= aExp ) {
4217             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4218             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
4219                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4220                 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4221             }
4222             else {
4223                 float_raise( float_flag_invalid );
4224                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4225                     return LIT64( 0x7FFFFFFFFFFFFFFF );
4226                 }
4227             }
4228             return (sbits64) LIT64( 0x8000000000000000 );
4229         }
4230         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4231         if ( (bits64) ( aSig1<<shiftCount ) ) {
4232             float_exception_flags |= float_flag_inexact;
4233         }
4234     }
4235     else {
4236         if ( aExp < 0x3FFF ) {
4237             if ( aExp | aSig0 | aSig1 ) {
4238                 float_exception_flags |= float_flag_inexact;
4239             }
4240             return 0;
4241         }
4242         z = aSig0>>( - shiftCount );
4243         if (    aSig1
4244              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4245             float_exception_flags |= float_flag_inexact;
4246         }
4247     }
4248     if ( aSign ) z = - z;
4249     return z;
4250
4251 }
4252
4253 /*----------------------------------------------------------------------------
4254 | Returns the result of converting the quadruple-precision floating-point
4255 | value `a' to the single-precision floating-point format.  The conversion
4256 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4257 | Arithmetic.
4258 *----------------------------------------------------------------------------*/
4259
4260 float32 float128_to_float32( float128 a )
4261 {
4262     flag aSign;
4263     int32 aExp;
4264     bits64 aSig0, aSig1;
4265     bits32 zSig;
4266
4267     aSig1 = extractFloat128Frac1( a );
4268     aSig0 = extractFloat128Frac0( a );
4269     aExp = extractFloat128Exp( a );
4270     aSign = extractFloat128Sign( a );
4271     if ( aExp == 0x7FFF ) {
4272         if ( aSig0 | aSig1 ) {
4273             return commonNaNToFloat32( float128ToCommonNaN( a ) );
4274         }
4275         return packFloat32( aSign, 0xFF, 0 );
4276     }
4277     aSig0 |= ( aSig1 != 0 );
4278     shift64RightJamming( aSig0, 18, &aSig0 );
4279     zSig = aSig0;
4280     if ( aExp || zSig ) {
4281         zSig |= 0x40000000;
4282         aExp -= 0x3F81;
4283     }
4284     return roundAndPackFloat32( aSign, aExp, zSig );
4285
4286 }
4287
4288 /*----------------------------------------------------------------------------
4289 | Returns the result of converting the quadruple-precision floating-point
4290 | value `a' to the double-precision floating-point format.  The conversion
4291 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4292 | Arithmetic.
4293 *----------------------------------------------------------------------------*/
4294
4295 float64 float128_to_float64( float128 a )
4296 {
4297     flag aSign;
4298     int32 aExp;
4299     bits64 aSig0, aSig1;
4300
4301     aSig1 = extractFloat128Frac1( a );
4302     aSig0 = extractFloat128Frac0( a );
4303     aExp = extractFloat128Exp( a );
4304     aSign = extractFloat128Sign( a );
4305     if ( aExp == 0x7FFF ) {
4306         if ( aSig0 | aSig1 ) {
4307             return commonNaNToFloat64( float128ToCommonNaN( a ) );
4308         }
4309         return packFloat64( aSign, 0x7FF, 0 );
4310     }
4311     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4312     aSig0 |= ( aSig1 != 0 );
4313     if ( aExp || aSig0 ) {
4314         aSig0 |= LIT64( 0x4000000000000000 );
4315         aExp -= 0x3C01;
4316     }
4317     return roundAndPackFloat64( aSign, aExp, aSig0 );
4318
4319 }
4320
4321 #ifdef FLOATX80
4322
4323 /*----------------------------------------------------------------------------
4324 | Returns the result of converting the quadruple-precision floating-point
4325 | value `a' to the extended double-precision floating-point format.  The
4326 | conversion is performed according to the IEC/IEEE Standard for Binary
4327 | Floating-Point Arithmetic.
4328 *----------------------------------------------------------------------------*/
4329
4330 floatx80 float128_to_floatx80( float128 a )
4331 {
4332     flag aSign;
4333     int32 aExp;
4334     bits64 aSig0, aSig1;
4335
4336     aSig1 = extractFloat128Frac1( a );
4337     aSig0 = extractFloat128Frac0( a );
4338     aExp = extractFloat128Exp( a );
4339     aSign = extractFloat128Sign( a );
4340     if ( aExp == 0x7FFF ) {
4341         if ( aSig0 | aSig1 ) {
4342             return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4343         }
4344         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4345     }
4346     if ( aExp == 0 ) {
4347         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4348         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4349     }
4350     else {
4351         aSig0 |= LIT64( 0x0001000000000000 );
4352     }
4353     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4354     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4355
4356 }
4357
4358 #endif
4359
4360 /*----------------------------------------------------------------------------
4361 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4362 | returns the result as a quadruple-precision floating-point value.  The
4363 | operation is performed according to the IEC/IEEE Standard for Binary
4364 | Floating-Point Arithmetic.
4365 *----------------------------------------------------------------------------*/
4366
4367 float128 float128_round_to_int( float128 a )
4368 {
4369     flag aSign;
4370     int32 aExp;
4371     bits64 lastBitMask, roundBitsMask;
4372     int8 roundingMode;
4373     float128 z;
4374
4375     aExp = extractFloat128Exp( a );
4376     if ( 0x402F <= aExp ) {
4377         if ( 0x406F <= aExp ) {
4378             if (    ( aExp == 0x7FFF )
4379                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4380                ) {
4381                 return propagateFloat128NaN( a, a );
4382             }
4383             return a;
4384         }
4385         lastBitMask = 1;
4386         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4387         roundBitsMask = lastBitMask - 1;
4388         z = a;
4389         roundingMode = float_rounding_mode;
4390         if ( roundingMode == float_round_nearest_even ) {
4391             if ( lastBitMask ) {
4392                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4393                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4394             }
4395             else {
4396                 if ( (sbits64) z.low < 0 ) {
4397                     ++z.high;
4398                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4399                 }
4400             }
4401         }
4402         else if ( roundingMode != float_round_to_zero ) {
4403             if (   extractFloat128Sign( z )
4404                  ^ ( roundingMode == float_round_up ) ) {
4405                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4406             }
4407         }
4408         z.low &= ~ roundBitsMask;
4409     }
4410     else {
4411         if ( aExp < 0x3FFF ) {
4412             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4413             float_exception_flags |= float_flag_inexact;
4414             aSign = extractFloat128Sign( a );
4415             switch ( float_rounding_mode ) {
4416              case float_round_nearest_even:
4417                 if (    ( aExp == 0x3FFE )
4418                      && (   extractFloat128Frac0( a )
4419                           | extractFloat128Frac1( a ) )
4420                    ) {
4421                     return packFloat128( aSign, 0x3FFF, 0, 0 );
4422                 }
4423                 break;
4424              case float_round_down:
4425                 return
4426                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4427                     : packFloat128( 0, 0, 0, 0 );
4428              case float_round_up:
4429                 return
4430                       aSign ? packFloat128( 1, 0, 0, 0 )
4431                     : packFloat128( 0, 0x3FFF, 0, 0 );
4432             }
4433             return packFloat128( aSign, 0, 0, 0 );
4434         }
4435         lastBitMask = 1;
4436         lastBitMask <<= 0x402F - aExp;
4437         roundBitsMask = lastBitMask - 1;
4438         z.low = 0;
4439         z.high = a.high;
4440         roundingMode = float_rounding_mode;
4441         if ( roundingMode == float_round_nearest_even ) {
4442             z.high += lastBitMask>>1;
4443             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4444                 z.high &= ~ lastBitMask;
4445             }
4446         }
4447         else if ( roundingMode != float_round_to_zero ) {
4448             if (   extractFloat128Sign( z )
4449                  ^ ( roundingMode == float_round_up ) ) {
4450                 z.high |= ( a.low != 0 );
4451                 z.high += roundBitsMask;
4452             }
4453         }
4454         z.high &= ~ roundBitsMask;
4455     }
4456     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4457         float_exception_flags |= float_flag_inexact;
4458     }
4459     return z;
4460
4461 }
4462
4463 /*----------------------------------------------------------------------------
4464 | Returns the result of adding the absolute values of the quadruple-precision
4465 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
4466 | before being returned.  `zSign' is ignored if the result is a NaN.
4467 | The addition is performed according to the IEC/IEEE Standard for Binary
4468 | Floating-Point Arithmetic.
4469 *----------------------------------------------------------------------------*/
4470
4471 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4472 {
4473     int32 aExp, bExp, zExp;
4474     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4475     int32 expDiff;
4476
4477     aSig1 = extractFloat128Frac1( a );
4478     aSig0 = extractFloat128Frac0( a );
4479     aExp = extractFloat128Exp( a );
4480     bSig1 = extractFloat128Frac1( b );
4481     bSig0 = extractFloat128Frac0( b );
4482     bExp = extractFloat128Exp( b );
4483     expDiff = aExp - bExp;
4484     if ( 0 < expDiff ) {
4485         if ( aExp == 0x7FFF ) {
4486             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4487             return a;
4488         }
4489         if ( bExp == 0 ) {
4490             --expDiff;
4491         }
4492         else {
4493             bSig0 |= LIT64( 0x0001000000000000 );
4494         }
4495         shift128ExtraRightJamming(
4496             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4497         zExp = aExp;
4498     }
4499     else if ( expDiff < 0 ) {
4500         if ( bExp == 0x7FFF ) {
4501             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4502             return packFloat128( zSign, 0x7FFF, 0, 0 );
4503         }
4504         if ( aExp == 0 ) {
4505             ++expDiff;
4506         }
4507         else {
4508             aSig0 |= LIT64( 0x0001000000000000 );
4509         }
4510         shift128ExtraRightJamming(
4511             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4512         zExp = bExp;
4513     }
4514     else {
4515         if ( aExp == 0x7FFF ) {
4516             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4517                 return propagateFloat128NaN( a, b );
4518             }
4519             return a;
4520         }
4521         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4522         if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4523         zSig2 = 0;
4524         zSig0 |= LIT64( 0x0002000000000000 );
4525         zExp = aExp;
4526         goto shiftRight1;
4527     }
4528     aSig0 |= LIT64( 0x0001000000000000 );
4529     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4530     --zExp;
4531     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4532     ++zExp;
4533  shiftRight1:
4534     shift128ExtraRightJamming(
4535         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4536  roundAndPack:
4537     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4538
4539 }
4540
4541 /*----------------------------------------------------------------------------
4542 | Returns the result of subtracting the absolute values of the quadruple-
4543 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
4544 | difference is negated before being returned.  `zSign' is ignored if the
4545 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
4546 | Standard for Binary Floating-Point Arithmetic.
4547 *----------------------------------------------------------------------------*/
4548
4549 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4550 {
4551     int32 aExp, bExp, zExp;
4552     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4553     int32 expDiff;
4554     float128 z;
4555
4556     aSig1 = extractFloat128Frac1( a );
4557     aSig0 = extractFloat128Frac0( a );
4558     aExp = extractFloat128Exp( a );
4559     bSig1 = extractFloat128Frac1( b );
4560     bSig0 = extractFloat128Frac0( b );
4561     bExp = extractFloat128Exp( b );
4562     expDiff = aExp - bExp;
4563     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4564     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4565     if ( 0 < expDiff ) goto aExpBigger;
4566     if ( expDiff < 0 ) goto bExpBigger;
4567     if ( aExp == 0x7FFF ) {
4568         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4569             return propagateFloat128NaN( a, b );
4570         }
4571         float_raise( float_flag_invalid );
4572         z.low = float128_default_nan_low;
4573         z.high = float128_default_nan_high;
4574         return z;
4575     }
4576     if ( aExp == 0 ) {
4577         aExp = 1;
4578         bExp = 1;
4579     }
4580     if ( bSig0 < aSig0 ) goto aBigger;
4581     if ( aSig0 < bSig0 ) goto bBigger;
4582     if ( bSig1 < aSig1 ) goto aBigger;
4583     if ( aSig1 < bSig1 ) goto bBigger;
4584     return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4585  bExpBigger:
4586     if ( bExp == 0x7FFF ) {
4587         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4588         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4589     }
4590     if ( aExp == 0 ) {
4591         ++expDiff;
4592     }
4593     else {
4594         aSig0 |= LIT64( 0x4000000000000000 );
4595     }
4596     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4597     bSig0 |= LIT64( 0x4000000000000000 );
4598  bBigger:
4599     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4600     zExp = bExp;
4601     zSign ^= 1;
4602     goto normalizeRoundAndPack;
4603  aExpBigger:
4604     if ( aExp == 0x7FFF ) {
4605         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4606         return a;
4607     }
4608     if ( bExp == 0 ) {
4609         --expDiff;
4610     }
4611     else {
4612         bSig0 |= LIT64( 0x4000000000000000 );
4613     }
4614     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4615     aSig0 |= LIT64( 0x4000000000000000 );
4616  aBigger:
4617     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4618     zExp = aExp;
4619  normalizeRoundAndPack:
4620     --zExp;
4621     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4622
4623 }
4624
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of adding the quadruple-precision floating-point values
4627 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
4628 | for Binary Floating-Point Arithmetic.
4629 *----------------------------------------------------------------------------*/
4630
4631 float128 float128_add( float128 a, float128 b )
4632 {
4633     flag aSign, bSign;
4634
4635     aSign = extractFloat128Sign( a );
4636     bSign = extractFloat128Sign( b );
4637     if ( aSign == bSign ) {
4638         return addFloat128Sigs( a, b, aSign );
4639     }
4640     else {
4641         return subFloat128Sigs( a, b, aSign );
4642     }
4643
4644 }
4645
4646 /*----------------------------------------------------------------------------
4647 | Returns the result of subtracting the quadruple-precision floating-point
4648 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4649 | Standard for Binary Floating-Point Arithmetic.
4650 *----------------------------------------------------------------------------*/
4651
4652 float128 float128_sub( float128 a, float128 b )
4653 {
4654     flag aSign, bSign;
4655
4656     aSign = extractFloat128Sign( a );
4657     bSign = extractFloat128Sign( b );
4658     if ( aSign == bSign ) {
4659         return subFloat128Sigs( a, b, aSign );
4660     }
4661     else {
4662         return addFloat128Sigs( a, b, aSign );
4663     }
4664
4665 }
4666
4667 /*----------------------------------------------------------------------------
4668 | Returns the result of multiplying the quadruple-precision floating-point
4669 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
4670 | Standard for Binary Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672
4673 float128 float128_mul( float128 a, float128 b )
4674 {
4675     flag aSign, bSign, zSign;
4676     int32 aExp, bExp, zExp;
4677     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4678     float128 z;
4679
4680     aSig1 = extractFloat128Frac1( a );
4681     aSig0 = extractFloat128Frac0( a );
4682     aExp = extractFloat128Exp( a );
4683     aSign = extractFloat128Sign( a );
4684     bSig1 = extractFloat128Frac1( b );
4685     bSig0 = extractFloat128Frac0( b );
4686     bExp = extractFloat128Exp( b );
4687     bSign = extractFloat128Sign( b );
4688     zSign = aSign ^ bSign;
4689     if ( aExp == 0x7FFF ) {
4690         if (    ( aSig0 | aSig1 )
4691              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4692             return propagateFloat128NaN( a, b );
4693         }
4694         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4695         return packFloat128( zSign, 0x7FFF, 0, 0 );
4696     }
4697     if ( bExp == 0x7FFF ) {
4698         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4699         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4700  invalid:
4701             float_raise( float_flag_invalid );
4702             z.low = float128_default_nan_low;
4703             z.high = float128_default_nan_high;
4704             return z;
4705         }
4706         return packFloat128( zSign, 0x7FFF, 0, 0 );
4707     }
4708     if ( aExp == 0 ) {
4709         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4710         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4711     }
4712     if ( bExp == 0 ) {
4713         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4714         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4715     }
4716     zExp = aExp + bExp - 0x4000;
4717     aSig0 |= LIT64( 0x0001000000000000 );
4718     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4719     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4720     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4721     zSig2 |= ( zSig3 != 0 );
4722     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4723         shift128ExtraRightJamming(
4724             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4725         ++zExp;
4726     }
4727     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4728
4729 }
4730
4731 /*----------------------------------------------------------------------------
4732 | Returns the result of dividing the quadruple-precision floating-point value
4733 | `a' by the corresponding value `b'.  The operation is performed according to
4734 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4735 *----------------------------------------------------------------------------*/
4736
4737 float128 float128_div( float128 a, float128 b )
4738 {
4739     flag aSign, bSign, zSign;
4740     int32 aExp, bExp, zExp;
4741     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4742     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4743     float128 z;
4744
4745     aSig1 = extractFloat128Frac1( a );
4746     aSig0 = extractFloat128Frac0( a );
4747     aExp = extractFloat128Exp( a );
4748     aSign = extractFloat128Sign( a );
4749     bSig1 = extractFloat128Frac1( b );
4750     bSig0 = extractFloat128Frac0( b );
4751     bExp = extractFloat128Exp( b );
4752     bSign = extractFloat128Sign( b );
4753     zSign = aSign ^ bSign;
4754     if ( aExp == 0x7FFF ) {
4755         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4756         if ( bExp == 0x7FFF ) {
4757             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4758             goto invalid;
4759         }
4760         return packFloat128( zSign, 0x7FFF, 0, 0 );
4761     }
4762     if ( bExp == 0x7FFF ) {
4763         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4764         return packFloat128( zSign, 0, 0, 0 );
4765     }
4766     if ( bExp == 0 ) {
4767         if ( ( bSig0 | bSig1 ) == 0 ) {
4768             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4769  invalid:
4770                 float_raise( float_flag_invalid );
4771                 z.low = float128_default_nan_low;
4772                 z.high = float128_default_nan_high;
4773                 return z;
4774             }
4775             float_raise( float_flag_divbyzero );
4776             return packFloat128( zSign, 0x7FFF, 0, 0 );
4777         }
4778         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4779     }
4780     if ( aExp == 0 ) {
4781         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4782         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4783     }
4784     zExp = aExp - bExp + 0x3FFD;
4785     shortShift128Left(
4786         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4787     shortShift128Left(
4788         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4789     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4790         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4791         ++zExp;
4792     }
4793     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4794     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4795     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4796     while ( (sbits64) rem0 < 0 ) {
4797         --zSig0;
4798         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4799     }
4800     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4801     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4802         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4803         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4804         while ( (sbits64) rem1 < 0 ) {
4805             --zSig1;
4806             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4807         }
4808         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4809     }
4810     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4811     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4812
4813 }
4814
4815 /*----------------------------------------------------------------------------
4816 | Returns the remainder of the quadruple-precision floating-point value `a'
4817 | with respect to the corresponding value `b'.  The operation is performed
4818 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4819 *----------------------------------------------------------------------------*/
4820
4821 float128 float128_rem( float128 a, float128 b )
4822 {
4823     flag aSign, bSign, zSign;
4824     int32 aExp, bExp, expDiff;
4825     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4826     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4827     sbits64 sigMean0;
4828     float128 z;
4829
4830     aSig1 = extractFloat128Frac1( a );
4831     aSig0 = extractFloat128Frac0( a );
4832     aExp = extractFloat128Exp( a );
4833     aSign = extractFloat128Sign( a );
4834     bSig1 = extractFloat128Frac1( b );
4835     bSig0 = extractFloat128Frac0( b );
4836     bExp = extractFloat128Exp( b );
4837     bSign = extractFloat128Sign( b );
4838     if ( aExp == 0x7FFF ) {
4839         if (    ( aSig0 | aSig1 )
4840              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4841             return propagateFloat128NaN( a, b );
4842         }
4843         goto invalid;
4844     }
4845     if ( bExp == 0x7FFF ) {
4846         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4847         return a;
4848     }
4849     if ( bExp == 0 ) {
4850         if ( ( bSig0 | bSig1 ) == 0 ) {
4851  invalid:
4852             float_raise( float_flag_invalid );
4853             z.low = float128_default_nan_low;
4854             z.high = float128_default_nan_high;
4855             return z;
4856         }
4857         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4858     }
4859     if ( aExp == 0 ) {
4860         if ( ( aSig0 | aSig1 ) == 0 ) return a;
4861         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4862     }
4863     expDiff = aExp - bExp;
4864     if ( expDiff < -1 ) return a;
4865     shortShift128Left(
4866         aSig0 | LIT64( 0x0001000000000000 ),
4867         aSig1,
4868         15 - ( expDiff < 0 ),
4869         &aSig0,
4870         &aSig1
4871     );
4872     shortShift128Left(
4873         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4874     q = le128( bSig0, bSig1, aSig0, aSig1 );
4875     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4876     expDiff -= 64;
4877     while ( 0 < expDiff ) {
4878         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4879         q = ( 4 < q ) ? q - 4 : 0;
4880         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4881         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4882         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4883         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4884         expDiff -= 61;
4885     }
4886     if ( -64 < expDiff ) {
4887         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4888         q = ( 4 < q ) ? q - 4 : 0;
4889         q >>= - expDiff;
4890         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4891         expDiff += 52;
4892         if ( expDiff < 0 ) {
4893             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4894         }
4895         else {
4896             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4897         }
4898         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4899         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4900     }
4901     else {
4902         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4903         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4904     }
4905     do {
4906         alternateASig0 = aSig0;
4907         alternateASig1 = aSig1;
4908         ++q;
4909         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4910     } while ( 0 <= (sbits64) aSig0 );
4911     add128(
4912         aSig0, aSig1, alternateASig0, alternateASig1, (bits64*)&sigMean0, &sigMean1 );
4913     if (    ( sigMean0 < 0 )
4914          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4915         aSig0 = alternateASig0;
4916         aSig1 = alternateASig1;
4917     }
4918     zSign = ( (sbits64) aSig0 < 0 );
4919     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4920     return
4921         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4922
4923 }
4924
4925 /*----------------------------------------------------------------------------
4926 | Returns the square root of the quadruple-precision floating-point value `a'.
4927 | The operation is performed according to the IEC/IEEE Standard for Binary
4928 | Floating-Point Arithmetic.
4929 *----------------------------------------------------------------------------*/
4930
4931 float128 float128_sqrt( float128 a )
4932 {
4933     flag aSign;
4934     int32 aExp, zExp;
4935     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4936     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4937     float128 z;
4938
4939     aSig1 = extractFloat128Frac1( a );
4940     aSig0 = extractFloat128Frac0( a );
4941     aExp = extractFloat128Exp( a );
4942     aSign = extractFloat128Sign( a );
4943     if ( aExp == 0x7FFF ) {
4944         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4945         if ( ! aSign ) return a;
4946         goto invalid;
4947     }
4948     if ( aSign ) {
4949         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4950  invalid:
4951         float_raise( float_flag_invalid );
4952         z.low = float128_default_nan_low;
4953         z.high = float128_default_nan_high;
4954         return z;
4955     }
4956     if ( aExp == 0 ) {
4957         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4958         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4959     }
4960     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4961     aSig0 |= LIT64( 0x0001000000000000 );
4962     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4963     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4964     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4965     doubleZSig0 = zSig0<<1;
4966     mul64To128( zSig0, zSig0, &term0, &term1 );
4967     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4968     while ( (sbits64) rem0 < 0 ) {
4969         --zSig0;
4970         doubleZSig0 -= 2;
4971         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4972     }
4973     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4974     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4975         if ( zSig1 == 0 ) zSig1 = 1;
4976         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4977         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4978         mul64To128( zSig1, zSig1, &term2, &term3 );
4979         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4980         while ( (sbits64) rem1 < 0 ) {
4981             --zSig1;
4982             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4983             term3 |= 1;
4984             term2 |= doubleZSig0;
4985             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4986         }
4987         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4988     }
4989     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4990     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4991
4992 }
4993
4994 /*----------------------------------------------------------------------------
4995 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4996 | the corresponding value `b', and 0 otherwise.  The comparison is performed
4997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4998 *----------------------------------------------------------------------------*/
4999
5000 flag float128_eq( float128 a, float128 b )
5001 {
5002
5003     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5004               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5005          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5006               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5007        ) {
5008         if (    float128_is_signaling_nan( a )
5009              || float128_is_signaling_nan( b ) ) {
5010             float_raise( float_flag_invalid );
5011         }
5012         return 0;
5013     }
5014     return
5015            ( a.low == b.low )
5016         && (    ( a.high == b.high )
5017              || (    ( a.low == 0 )
5018                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5019            );
5020
5021 }
5022
5023 /*----------------------------------------------------------------------------
5024 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5025 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
5026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5027 | Arithmetic.
5028 *----------------------------------------------------------------------------*/
5029
5030 flag float128_le( float128 a, float128 b )
5031 {
5032     flag aSign, bSign;
5033
5034     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5035               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5036          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5037               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5038        ) {
5039         float_raise( float_flag_invalid );
5040         return 0;
5041     }
5042     aSign = extractFloat128Sign( a );
5043     bSign = extractFloat128Sign( b );
5044     if ( aSign != bSign ) {
5045         return
5046                aSign
5047             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5048                  == 0 );
5049     }
5050     return
5051           aSign ? le128( b.high, b.low, a.high, a.low )
5052         : le128( a.high, a.low, b.high, b.low );
5053
5054 }
5055
5056 /*----------------------------------------------------------------------------
5057 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5058 | the corresponding value `b', and 0 otherwise.  The comparison is performed
5059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5060 *----------------------------------------------------------------------------*/
5061
5062 flag float128_lt( float128 a, float128 b )
5063 {
5064     flag aSign, bSign;
5065
5066     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5067               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5068          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5069               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5070        ) {
5071         float_raise( float_flag_invalid );
5072         return 0;
5073     }
5074     aSign = extractFloat128Sign( a );
5075     bSign = extractFloat128Sign( b );
5076     if ( aSign != bSign ) {
5077         return
5078                aSign
5079             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5080                  != 0 );
5081     }
5082     return
5083           aSign ? lt128( b.high, b.low, a.high, a.low )
5084         : lt128( a.high, a.low, b.high, b.low );
5085
5086 }
5087
5088 /*----------------------------------------------------------------------------
5089 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5090 | the corresponding value `b', and 0 otherwise.  The invalid exception is
5091 | raised if either operand is a NaN.  Otherwise, the comparison is performed
5092 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094
5095 flag float128_eq_signaling( float128 a, float128 b )
5096 {
5097
5098     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5099               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5100          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5101               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5102        ) {
5103         float_raise( float_flag_invalid );
5104         return 0;
5105     }
5106     return
5107            ( a.low == b.low )
5108         && (    ( a.high == b.high )
5109              || (    ( a.low == 0 )
5110                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5111            );
5112
5113 }
5114
5115 /*----------------------------------------------------------------------------
5116 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5117 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
5118 | cause an exception.  Otherwise, the comparison is performed according to the
5119 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5121
5122 flag float128_le_quiet( float128 a, float128 b )
5123 {
5124     flag aSign, bSign;
5125
5126     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5127               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5128          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5129               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5130        ) {
5131         if (    float128_is_signaling_nan( a )
5132              || float128_is_signaling_nan( b ) ) {
5133             float_raise( float_flag_invalid );
5134         }
5135         return 0;
5136     }
5137     aSign = extractFloat128Sign( a );
5138     bSign = extractFloat128Sign( b );
5139     if ( aSign != bSign ) {
5140         return
5141                aSign
5142             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5143                  == 0 );
5144     }
5145     return
5146           aSign ? le128( b.high, b.low, a.high, a.low )
5147         : le128( a.high, a.low, b.high, b.low );
5148
5149 }
5150
5151 /*----------------------------------------------------------------------------
5152 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5153 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
5154 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
5155 | Standard for Binary Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5157
5158 flag float128_lt_quiet( float128 a, float128 b )
5159 {
5160     flag aSign, bSign;
5161
5162     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
5163               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5164          || (    ( extractFloat128Exp( b ) == 0x7FFF )
5165               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5166        ) {
5167         if (    float128_is_signaling_nan( a )
5168              || float128_is_signaling_nan( b ) ) {
5169             float_raise( float_flag_invalid );
5170         }
5171         return 0;
5172     }
5173     aSign = extractFloat128Sign( a );
5174     bSign = extractFloat128Sign( b );
5175     if ( aSign != bSign ) {
5176         return
5177                aSign
5178             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5179                  != 0 );
5180     }
5181     return
5182           aSign ? lt128( b.high, b.low, a.high, a.low )
5183         : lt128( a.high, a.low, b.high, b.low );
5184
5185 }
5186
5187 #endif
5188