[xcc,sim] implement FP using softfloat
[riscv-isa-sim.git] / softfloat / softfloat.c
1
2 /*============================================================================
3
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
5 Package, Release 2b.
6
7 Written by John R. Hauser. This work was made possible in part by the
8 International Computer Science Institute, located at Suite 600, 1947 Center
9 Street, Berkeley, California 94704. Funding was partially provided by the
10 National Science Foundation under grant MIP-9311980. The original version
11 of this code was written as part of a project to build a fixed-point vector
12 processor in collaboration with the University of California at Berkeley,
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
15 arithmetic/SoftFloat.html'.
16
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
25
26 Derivative works are acceptable, even for commercial purposes, so long as
27 (1) the source code for the derivative work includes prominent notice that
28 the work is derivative, and (2) the source code includes prominent notice with
29 these four paragraphs for those parts of this code that are retained.
30
31 =============================================================================*/
32
33 #include "milieu.h"
34 #include "softfloat.h"
35
36 /*----------------------------------------------------------------------------
37 | Floating-point rounding mode, extended double-precision rounding precision,
38 | and exception flags.
39 *----------------------------------------------------------------------------*/
40 int8 float_rounding_mode = float_round_nearest_even;
41 int8 float_exception_flags = 0;
42 #ifdef FLOATX80
43 int8 floatx80_rounding_precision = 80;
44 #endif
45
46 /*----------------------------------------------------------------------------
47 | Primitive arithmetic functions, including multi-word arithmetic, and
48 | division and square root approximations. (Can be specialized to target if
49 | desired.)
50 *----------------------------------------------------------------------------*/
51 #include "softfloat-macros"
52
53 /*----------------------------------------------------------------------------
54 | Functions and definitions to determine: (1) whether tininess for underflow
55 | is detected before or after rounding by default, (2) what (if anything)
56 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
57 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
58 | are propagated from function inputs to output. These details are target-
59 | specific.
60 *----------------------------------------------------------------------------*/
61 #include "softfloat-specialize"
62
63 /*----------------------------------------------------------------------------
64 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
65 | and 7, and returns the properly rounded 32-bit integer corresponding to the
66 | input. If `zSign' is 1, the input is negated before being converted to an
67 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
68 | is simply rounded to an integer, with the inexact exception raised if the
69 | input cannot be represented exactly as an integer. However, if the fixed-
70 | point input is too large, the invalid exception is raised and the largest
71 | positive or negative integer is returned.
72 *----------------------------------------------------------------------------*/
73
74 static int32 roundAndPackInt32( flag zSign, bits64 absZ )
75 {
76 int8 roundingMode;
77 flag roundNearestEven;
78 int8 roundIncrement, roundBits;
79 int32 z;
80
81 roundingMode = float_rounding_mode;
82 roundNearestEven = ( roundingMode == float_round_nearest_even );
83 roundIncrement = 0x40;
84 if ( ! roundNearestEven ) {
85 if ( roundingMode == float_round_to_zero ) {
86 roundIncrement = 0;
87 }
88 else {
89 roundIncrement = 0x7F;
90 if ( zSign ) {
91 if ( roundingMode == float_round_up ) roundIncrement = 0;
92 }
93 else {
94 if ( roundingMode == float_round_down ) roundIncrement = 0;
95 }
96 }
97 }
98 roundBits = absZ & 0x7F;
99 absZ = ( absZ + roundIncrement )>>7;
100 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
101 z = absZ;
102 if ( zSign ) z = - z;
103 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
104 float_raise( float_flag_invalid );
105 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
106 }
107 if ( roundBits ) float_exception_flags |= float_flag_inexact;
108 return z;
109
110 }
111
112 /*----------------------------------------------------------------------------
113 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
114 | `absZ1', with binary point between bits 63 and 64 (between the input words),
115 | and returns the properly rounded 64-bit integer corresponding to the input.
116 | If `zSign' is 1, the input is negated before being converted to an integer.
117 | Ordinarily, the fixed-point input is simply rounded to an integer, with
118 | the inexact exception raised if the input cannot be represented exactly as
119 | an integer. However, if the fixed-point input is too large, the invalid
120 | exception is raised and the largest positive or negative integer is
121 | returned.
122 *----------------------------------------------------------------------------*/
123
124 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 )
125 {
126 int8 roundingMode;
127 flag roundNearestEven, increment;
128 int64 z;
129
130 roundingMode = float_rounding_mode;
131 roundNearestEven = ( roundingMode == float_round_nearest_even );
132 increment = ( (sbits64) absZ1 < 0 );
133 if ( ! roundNearestEven ) {
134 if ( roundingMode == float_round_to_zero ) {
135 increment = 0;
136 }
137 else {
138 if ( zSign ) {
139 increment = ( roundingMode == float_round_down ) && absZ1;
140 }
141 else {
142 increment = ( roundingMode == float_round_up ) && absZ1;
143 }
144 }
145 }
146 if ( increment ) {
147 ++absZ0;
148 if ( absZ0 == 0 ) goto overflow;
149 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
150 }
151 z = absZ0;
152 if ( zSign ) z = - z;
153 if ( z && ( ( z < 0 ) ^ zSign ) ) {
154 overflow:
155 float_raise( float_flag_invalid );
156 return
157 zSign ? (sbits64) LIT64( 0x8000000000000000 )
158 : LIT64( 0x7FFFFFFFFFFFFFFF );
159 }
160 if ( absZ1 ) float_exception_flags |= float_flag_inexact;
161 return z;
162
163 }
164
165 /*----------------------------------------------------------------------------
166 | Returns the fraction bits of the single-precision floating-point value `a'.
167 *----------------------------------------------------------------------------*/
168
169 INLINE bits32 extractFloat32Frac( float32 a )
170 {
171
172 return a & 0x007FFFFF;
173
174 }
175
176 /*----------------------------------------------------------------------------
177 | Returns the exponent bits of the single-precision floating-point value `a'.
178 *----------------------------------------------------------------------------*/
179
180 INLINE int16 extractFloat32Exp( float32 a )
181 {
182
183 return ( a>>23 ) & 0xFF;
184
185 }
186
187 /*----------------------------------------------------------------------------
188 | Returns the sign bit of the single-precision floating-point value `a'.
189 *----------------------------------------------------------------------------*/
190
191 INLINE flag extractFloat32Sign( float32 a )
192 {
193
194 return a>>31;
195
196 }
197
198 /*----------------------------------------------------------------------------
199 | Normalizes the subnormal single-precision floating-point value represented
200 | by the denormalized significand `aSig'. The normalized exponent and
201 | significand are stored at the locations pointed to by `zExpPtr' and
202 | `zSigPtr', respectively.
203 *----------------------------------------------------------------------------*/
204
205 static void
206 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
207 {
208 int8 shiftCount;
209
210 shiftCount = countLeadingZeros32( aSig ) - 8;
211 *zSigPtr = aSig<<shiftCount;
212 *zExpPtr = 1 - shiftCount;
213
214 }
215
216 /*----------------------------------------------------------------------------
217 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
218 | single-precision floating-point value, returning the result. After being
219 | shifted into the proper positions, the three fields are simply added
220 | together to form the result. This means that any integer portion of `zSig'
221 | will be added into the exponent. Since a properly normalized significand
222 | will have an integer portion equal to 1, the `zExp' input should be 1 less
223 | than the desired result exponent whenever `zSig' is a complete, normalized
224 | significand.
225 *----------------------------------------------------------------------------*/
226
227 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
228 {
229
230 return ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig;
231
232 }
233
234 /*----------------------------------------------------------------------------
235 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
236 | and significand `zSig', and returns the proper single-precision floating-
237 | point value corresponding to the abstract input. Ordinarily, the abstract
238 | value is simply rounded and packed into the single-precision format, with
239 | the inexact exception raised if the abstract input cannot be represented
240 | exactly. However, if the abstract value is too large, the overflow and
241 | inexact exceptions are raised and an infinity or maximal finite value is
242 | returned. If the abstract value is too small, the input value is rounded to
243 | a subnormal number, and the underflow and inexact exceptions are raised if
244 | the abstract input cannot be represented exactly as a subnormal single-
245 | precision floating-point number.
246 | The input significand `zSig' has its binary point between bits 30
247 | and 29, which is 7 bits to the left of the usual location. This shifted
248 | significand must be normalized or smaller. If `zSig' is not normalized,
249 | `zExp' must be 0; in that case, the result returned is a subnormal number,
250 | and it must not require rounding. In the usual case that `zSig' is
251 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
252 | The handling of underflow and overflow follows the IEC/IEEE Standard for
253 | Binary Floating-Point Arithmetic.
254 *----------------------------------------------------------------------------*/
255
256 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
257 {
258 int8 roundingMode;
259 flag roundNearestEven;
260 int8 roundIncrement, roundBits;
261 flag isTiny;
262
263 roundingMode = float_rounding_mode;
264 roundNearestEven = ( roundingMode == float_round_nearest_even );
265 roundIncrement = 0x40;
266 if ( ! roundNearestEven ) {
267 if ( roundingMode == float_round_to_zero ) {
268 roundIncrement = 0;
269 }
270 else {
271 roundIncrement = 0x7F;
272 if ( zSign ) {
273 if ( roundingMode == float_round_up ) roundIncrement = 0;
274 }
275 else {
276 if ( roundingMode == float_round_down ) roundIncrement = 0;
277 }
278 }
279 }
280 roundBits = zSig & 0x7F;
281 if ( 0xFD <= (bits16) zExp ) {
282 if ( ( 0xFD < zExp )
283 || ( ( zExp == 0xFD )
284 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
285 ) {
286 float_raise( float_flag_overflow | float_flag_inexact );
287 return packFloat32( zSign, 0xFF, 0 ) - ( roundIncrement == 0 );
288 }
289 if ( zExp < 0 ) {
290 isTiny =
291 ( float_detect_tininess == float_tininess_before_rounding )
292 || ( zExp < -1 )
293 || ( zSig + roundIncrement < 0x80000000 );
294 shift32RightJamming( zSig, - zExp, &zSig );
295 zExp = 0;
296 roundBits = zSig & 0x7F;
297 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
298 }
299 }
300 if ( roundBits ) float_exception_flags |= float_flag_inexact;
301 zSig = ( zSig + roundIncrement )>>7;
302 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
303 if ( zSig == 0 ) zExp = 0;
304 return packFloat32( zSign, zExp, zSig );
305
306 }
307
308 /*----------------------------------------------------------------------------
309 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
310 | and significand `zSig', and returns the proper single-precision floating-
311 | point value corresponding to the abstract input. This routine is just like
312 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
313 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
314 | floating-point exponent.
315 *----------------------------------------------------------------------------*/
316
317 static float32
318 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig )
319 {
320 int8 shiftCount;
321
322 shiftCount = countLeadingZeros32( zSig ) - 1;
323 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount );
324
325 }
326
327 /*----------------------------------------------------------------------------
328 | Returns the fraction bits of the double-precision floating-point value `a'.
329 *----------------------------------------------------------------------------*/
330
331 INLINE bits64 extractFloat64Frac( float64 a )
332 {
333
334 return a & LIT64( 0x000FFFFFFFFFFFFF );
335
336 }
337
338 /*----------------------------------------------------------------------------
339 | Returns the exponent bits of the double-precision floating-point value `a'.
340 *----------------------------------------------------------------------------*/
341
342 INLINE int16 extractFloat64Exp( float64 a )
343 {
344
345 return ( a>>52 ) & 0x7FF;
346
347 }
348
349 /*----------------------------------------------------------------------------
350 | Returns the sign bit of the double-precision floating-point value `a'.
351 *----------------------------------------------------------------------------*/
352
353 INLINE flag extractFloat64Sign( float64 a )
354 {
355
356 return a>>63;
357
358 }
359
360 /*----------------------------------------------------------------------------
361 | Normalizes the subnormal double-precision floating-point value represented
362 | by the denormalized significand `aSig'. The normalized exponent and
363 | significand are stored at the locations pointed to by `zExpPtr' and
364 | `zSigPtr', respectively.
365 *----------------------------------------------------------------------------*/
366
367 static void
368 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
369 {
370 int8 shiftCount;
371
372 shiftCount = countLeadingZeros64( aSig ) - 11;
373 *zSigPtr = aSig<<shiftCount;
374 *zExpPtr = 1 - shiftCount;
375
376 }
377
378 /*----------------------------------------------------------------------------
379 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
380 | double-precision floating-point value, returning the result. After being
381 | shifted into the proper positions, the three fields are simply added
382 | together to form the result. This means that any integer portion of `zSig'
383 | will be added into the exponent. Since a properly normalized significand
384 | will have an integer portion equal to 1, the `zExp' input should be 1 less
385 | than the desired result exponent whenever `zSig' is a complete, normalized
386 | significand.
387 *----------------------------------------------------------------------------*/
388
389 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
390 {
391
392 return ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig;
393
394 }
395
396 /*----------------------------------------------------------------------------
397 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
398 | and significand `zSig', and returns the proper double-precision floating-
399 | point value corresponding to the abstract input. Ordinarily, the abstract
400 | value is simply rounded and packed into the double-precision format, with
401 | the inexact exception raised if the abstract input cannot be represented
402 | exactly. However, if the abstract value is too large, the overflow and
403 | inexact exceptions are raised and an infinity or maximal finite value is
404 | returned. If the abstract value is too small, the input value is rounded
405 | to a subnormal number, and the underflow and inexact exceptions are raised
406 | if the abstract input cannot be represented exactly as a subnormal double-
407 | precision floating-point number.
408 | The input significand `zSig' has its binary point between bits 62
409 | and 61, which is 10 bits to the left of the usual location. This shifted
410 | significand must be normalized or smaller. If `zSig' is not normalized,
411 | `zExp' must be 0; in that case, the result returned is a subnormal number,
412 | and it must not require rounding. In the usual case that `zSig' is
413 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
414 | The handling of underflow and overflow follows the IEC/IEEE Standard for
415 | Binary Floating-Point Arithmetic.
416 *----------------------------------------------------------------------------*/
417
418 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
419 {
420 int8 roundingMode;
421 flag roundNearestEven;
422 int16 roundIncrement, roundBits;
423 flag isTiny;
424
425 roundingMode = float_rounding_mode;
426 roundNearestEven = ( roundingMode == float_round_nearest_even );
427 roundIncrement = 0x200;
428 if ( ! roundNearestEven ) {
429 if ( roundingMode == float_round_to_zero ) {
430 roundIncrement = 0;
431 }
432 else {
433 roundIncrement = 0x3FF;
434 if ( zSign ) {
435 if ( roundingMode == float_round_up ) roundIncrement = 0;
436 }
437 else {
438 if ( roundingMode == float_round_down ) roundIncrement = 0;
439 }
440 }
441 }
442 roundBits = zSig & 0x3FF;
443 if ( 0x7FD <= (bits16) zExp ) {
444 if ( ( 0x7FD < zExp )
445 || ( ( zExp == 0x7FD )
446 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
447 ) {
448 float_raise( float_flag_overflow | float_flag_inexact );
449 return packFloat64( zSign, 0x7FF, 0 ) - ( roundIncrement == 0 );
450 }
451 if ( zExp < 0 ) {
452 isTiny =
453 ( float_detect_tininess == float_tininess_before_rounding )
454 || ( zExp < -1 )
455 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
456 shift64RightJamming( zSig, - zExp, &zSig );
457 zExp = 0;
458 roundBits = zSig & 0x3FF;
459 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
460 }
461 }
462 if ( roundBits ) float_exception_flags |= float_flag_inexact;
463 zSig = ( zSig + roundIncrement )>>10;
464 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
465 if ( zSig == 0 ) zExp = 0;
466 return packFloat64( zSign, zExp, zSig );
467
468 }
469
470 /*----------------------------------------------------------------------------
471 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
472 | and significand `zSig', and returns the proper double-precision floating-
473 | point value corresponding to the abstract input. This routine is just like
474 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
475 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
476 | floating-point exponent.
477 *----------------------------------------------------------------------------*/
478
479 static float64
480 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig )
481 {
482 int8 shiftCount;
483
484 shiftCount = countLeadingZeros64( zSig ) - 1;
485 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount );
486
487 }
488
489 #ifdef FLOATX80
490
491 /*----------------------------------------------------------------------------
492 | Returns the fraction bits of the extended double-precision floating-point
493 | value `a'.
494 *----------------------------------------------------------------------------*/
495
496 INLINE bits64 extractFloatx80Frac( floatx80 a )
497 {
498
499 return a.low;
500
501 }
502
503 /*----------------------------------------------------------------------------
504 | Returns the exponent bits of the extended double-precision floating-point
505 | value `a'.
506 *----------------------------------------------------------------------------*/
507
508 INLINE int32 extractFloatx80Exp( floatx80 a )
509 {
510
511 return a.high & 0x7FFF;
512
513 }
514
515 /*----------------------------------------------------------------------------
516 | Returns the sign bit of the extended double-precision floating-point value
517 | `a'.
518 *----------------------------------------------------------------------------*/
519
520 INLINE flag extractFloatx80Sign( floatx80 a )
521 {
522
523 return a.high>>15;
524
525 }
526
527 /*----------------------------------------------------------------------------
528 | Normalizes the subnormal extended double-precision floating-point value
529 | represented by the denormalized significand `aSig'. The normalized exponent
530 | and significand are stored at the locations pointed to by `zExpPtr' and
531 | `zSigPtr', respectively.
532 *----------------------------------------------------------------------------*/
533
534 static void
535 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
536 {
537 int8 shiftCount;
538
539 shiftCount = countLeadingZeros64( aSig );
540 *zSigPtr = aSig<<shiftCount;
541 *zExpPtr = 1 - shiftCount;
542
543 }
544
545 /*----------------------------------------------------------------------------
546 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
547 | extended double-precision floating-point value, returning the result.
548 *----------------------------------------------------------------------------*/
549
550 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
551 {
552 floatx80 z;
553
554 z.low = zSig;
555 z.high = ( ( (bits16) zSign )<<15 ) + zExp;
556 return z;
557
558 }
559
560 /*----------------------------------------------------------------------------
561 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
562 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
563 | and returns the proper extended double-precision floating-point value
564 | corresponding to the abstract input. Ordinarily, the abstract value is
565 | rounded and packed into the extended double-precision format, with the
566 | inexact exception raised if the abstract input cannot be represented
567 | exactly. However, if the abstract value is too large, the overflow and
568 | inexact exceptions are raised and an infinity or maximal finite value is
569 | returned. If the abstract value is too small, the input value is rounded to
570 | a subnormal number, and the underflow and inexact exceptions are raised if
571 | the abstract input cannot be represented exactly as a subnormal extended
572 | double-precision floating-point number.
573 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
574 | number of bits as single or double precision, respectively. Otherwise, the
575 | result is rounded to the full precision of the extended double-precision
576 | format.
577 | The input significand must be normalized or smaller. If the input
578 | significand is not normalized, `zExp' must be 0; in that case, the result
579 | returned is a subnormal number, and it must not require rounding. The
580 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
581 | Floating-Point Arithmetic.
582 *----------------------------------------------------------------------------*/
583
584 static floatx80
585 roundAndPackFloatx80(
586 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
587 )
588 {
589 int8 roundingMode;
590 flag roundNearestEven, increment, isTiny;
591 int64 roundIncrement, roundMask, roundBits;
592
593 roundingMode = float_rounding_mode;
594 roundNearestEven = ( roundingMode == float_round_nearest_even );
595 if ( roundingPrecision == 80 ) goto precision80;
596 if ( roundingPrecision == 64 ) {
597 roundIncrement = LIT64( 0x0000000000000400 );
598 roundMask = LIT64( 0x00000000000007FF );
599 }
600 else if ( roundingPrecision == 32 ) {
601 roundIncrement = LIT64( 0x0000008000000000 );
602 roundMask = LIT64( 0x000000FFFFFFFFFF );
603 }
604 else {
605 goto precision80;
606 }
607 zSig0 |= ( zSig1 != 0 );
608 if ( ! roundNearestEven ) {
609 if ( roundingMode == float_round_to_zero ) {
610 roundIncrement = 0;
611 }
612 else {
613 roundIncrement = roundMask;
614 if ( zSign ) {
615 if ( roundingMode == float_round_up ) roundIncrement = 0;
616 }
617 else {
618 if ( roundingMode == float_round_down ) roundIncrement = 0;
619 }
620 }
621 }
622 roundBits = zSig0 & roundMask;
623 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
624 if ( ( 0x7FFE < zExp )
625 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
626 ) {
627 goto overflow;
628 }
629 if ( zExp <= 0 ) {
630 isTiny =
631 ( float_detect_tininess == float_tininess_before_rounding )
632 || ( zExp < 0 )
633 || ( zSig0 <= zSig0 + roundIncrement );
634 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
635 zExp = 0;
636 roundBits = zSig0 & roundMask;
637 if ( isTiny && roundBits ) float_raise( float_flag_underflow );
638 if ( roundBits ) float_exception_flags |= float_flag_inexact;
639 zSig0 += roundIncrement;
640 if ( (sbits64) zSig0 < 0 ) zExp = 1;
641 roundIncrement = roundMask + 1;
642 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
643 roundMask |= roundIncrement;
644 }
645 zSig0 &= ~ roundMask;
646 return packFloatx80( zSign, zExp, zSig0 );
647 }
648 }
649 if ( roundBits ) float_exception_flags |= float_flag_inexact;
650 zSig0 += roundIncrement;
651 if ( zSig0 < roundIncrement ) {
652 ++zExp;
653 zSig0 = LIT64( 0x8000000000000000 );
654 }
655 roundIncrement = roundMask + 1;
656 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
657 roundMask |= roundIncrement;
658 }
659 zSig0 &= ~ roundMask;
660 if ( zSig0 == 0 ) zExp = 0;
661 return packFloatx80( zSign, zExp, zSig0 );
662 precision80:
663 increment = ( (sbits64) zSig1 < 0 );
664 if ( ! roundNearestEven ) {
665 if ( roundingMode == float_round_to_zero ) {
666 increment = 0;
667 }
668 else {
669 if ( zSign ) {
670 increment = ( roundingMode == float_round_down ) && zSig1;
671 }
672 else {
673 increment = ( roundingMode == float_round_up ) && zSig1;
674 }
675 }
676 }
677 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
678 if ( ( 0x7FFE < zExp )
679 || ( ( zExp == 0x7FFE )
680 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
681 && increment
682 )
683 ) {
684 roundMask = 0;
685 overflow:
686 float_raise( float_flag_overflow | float_flag_inexact );
687 if ( ( roundingMode == float_round_to_zero )
688 || ( zSign && ( roundingMode == float_round_up ) )
689 || ( ! zSign && ( roundingMode == float_round_down ) )
690 ) {
691 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
692 }
693 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
694 }
695 if ( zExp <= 0 ) {
696 isTiny =
697 ( float_detect_tininess == float_tininess_before_rounding )
698 || ( zExp < 0 )
699 || ! increment
700 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
701 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
702 zExp = 0;
703 if ( isTiny && zSig1 ) float_raise( float_flag_underflow );
704 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
705 if ( roundNearestEven ) {
706 increment = ( (sbits64) zSig1 < 0 );
707 }
708 else {
709 if ( zSign ) {
710 increment = ( roundingMode == float_round_down ) && zSig1;
711 }
712 else {
713 increment = ( roundingMode == float_round_up ) && zSig1;
714 }
715 }
716 if ( increment ) {
717 ++zSig0;
718 zSig0 &=
719 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
720 if ( (sbits64) zSig0 < 0 ) zExp = 1;
721 }
722 return packFloatx80( zSign, zExp, zSig0 );
723 }
724 }
725 if ( zSig1 ) float_exception_flags |= float_flag_inexact;
726 if ( increment ) {
727 ++zSig0;
728 if ( zSig0 == 0 ) {
729 ++zExp;
730 zSig0 = LIT64( 0x8000000000000000 );
731 }
732 else {
733 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
734 }
735 }
736 else {
737 if ( zSig0 == 0 ) zExp = 0;
738 }
739 return packFloatx80( zSign, zExp, zSig0 );
740
741 }
742
743 /*----------------------------------------------------------------------------
744 | Takes an abstract floating-point value having sign `zSign', exponent
745 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
746 | and returns the proper extended double-precision floating-point value
747 | corresponding to the abstract input. This routine is just like
748 | `roundAndPackFloatx80' except that the input significand does not have to be
749 | normalized.
750 *----------------------------------------------------------------------------*/
751
752 static floatx80
753 normalizeRoundAndPackFloatx80(
754 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
755 )
756 {
757 int8 shiftCount;
758
759 if ( zSig0 == 0 ) {
760 zSig0 = zSig1;
761 zSig1 = 0;
762 zExp -= 64;
763 }
764 shiftCount = countLeadingZeros64( zSig0 );
765 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
766 zExp -= shiftCount;
767 return
768 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 );
769
770 }
771
772 #endif
773
774 #ifdef FLOAT128
775
776 /*----------------------------------------------------------------------------
777 | Returns the least-significant 64 fraction bits of the quadruple-precision
778 | floating-point value `a'.
779 *----------------------------------------------------------------------------*/
780
781 INLINE bits64 extractFloat128Frac1( float128 a )
782 {
783
784 return a.low;
785
786 }
787
788 /*----------------------------------------------------------------------------
789 | Returns the most-significant 48 fraction bits of the quadruple-precision
790 | floating-point value `a'.
791 *----------------------------------------------------------------------------*/
792
793 INLINE bits64 extractFloat128Frac0( float128 a )
794 {
795
796 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
797
798 }
799
800 /*----------------------------------------------------------------------------
801 | Returns the exponent bits of the quadruple-precision floating-point value
802 | `a'.
803 *----------------------------------------------------------------------------*/
804
805 INLINE int32 extractFloat128Exp( float128 a )
806 {
807
808 return ( a.high>>48 ) & 0x7FFF;
809
810 }
811
812 /*----------------------------------------------------------------------------
813 | Returns the sign bit of the quadruple-precision floating-point value `a'.
814 *----------------------------------------------------------------------------*/
815
816 INLINE flag extractFloat128Sign( float128 a )
817 {
818
819 return a.high>>63;
820
821 }
822
823 /*----------------------------------------------------------------------------
824 | Normalizes the subnormal quadruple-precision floating-point value
825 | represented by the denormalized significand formed by the concatenation of
826 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
827 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
828 | significand are stored at the location pointed to by `zSig0Ptr', and the
829 | least significant 64 bits of the normalized significand are stored at the
830 | location pointed to by `zSig1Ptr'.
831 *----------------------------------------------------------------------------*/
832
833 static void
834 normalizeFloat128Subnormal(
835 bits64 aSig0,
836 bits64 aSig1,
837 int32 *zExpPtr,
838 bits64 *zSig0Ptr,
839 bits64 *zSig1Ptr
840 )
841 {
842 int8 shiftCount;
843
844 if ( aSig0 == 0 ) {
845 shiftCount = countLeadingZeros64( aSig1 ) - 15;
846 if ( shiftCount < 0 ) {
847 *zSig0Ptr = aSig1>>( - shiftCount );
848 *zSig1Ptr = aSig1<<( shiftCount & 63 );
849 }
850 else {
851 *zSig0Ptr = aSig1<<shiftCount;
852 *zSig1Ptr = 0;
853 }
854 *zExpPtr = - shiftCount - 63;
855 }
856 else {
857 shiftCount = countLeadingZeros64( aSig0 ) - 15;
858 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
859 *zExpPtr = 1 - shiftCount;
860 }
861
862 }
863
864 /*----------------------------------------------------------------------------
865 | Packs the sign `zSign', the exponent `zExp', and the significand formed
866 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
867 | floating-point value, returning the result. After being shifted into the
868 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
869 | added together to form the most significant 32 bits of the result. This
870 | means that any integer portion of `zSig0' will be added into the exponent.
871 | Since a properly normalized significand will have an integer portion equal
872 | to 1, the `zExp' input should be 1 less than the desired result exponent
873 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
874 | significand.
875 *----------------------------------------------------------------------------*/
876
877 INLINE float128
878 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
879 {
880 float128 z;
881
882 z.low = zSig1;
883 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
884 return z;
885
886 }
887
888 /*----------------------------------------------------------------------------
889 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
890 | and extended significand formed by the concatenation of `zSig0', `zSig1',
891 | and `zSig2', and returns the proper quadruple-precision floating-point value
892 | corresponding to the abstract input. Ordinarily, the abstract value is
893 | simply rounded and packed into the quadruple-precision format, with the
894 | inexact exception raised if the abstract input cannot be represented
895 | exactly. However, if the abstract value is too large, the overflow and
896 | inexact exceptions are raised and an infinity or maximal finite value is
897 | returned. If the abstract value is too small, the input value is rounded to
898 | a subnormal number, and the underflow and inexact exceptions are raised if
899 | the abstract input cannot be represented exactly as a subnormal quadruple-
900 | precision floating-point number.
901 | The input significand must be normalized or smaller. If the input
902 | significand is not normalized, `zExp' must be 0; in that case, the result
903 | returned is a subnormal number, and it must not require rounding. In the
904 | usual case that the input significand is normalized, `zExp' must be 1 less
905 | than the ``true'' floating-point exponent. The handling of underflow and
906 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
907 *----------------------------------------------------------------------------*/
908
909 static float128
910 roundAndPackFloat128(
911 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 )
912 {
913 int8 roundingMode;
914 flag roundNearestEven, increment, isTiny;
915
916 roundingMode = float_rounding_mode;
917 roundNearestEven = ( roundingMode == float_round_nearest_even );
918 increment = ( (sbits64) zSig2 < 0 );
919 if ( ! roundNearestEven ) {
920 if ( roundingMode == float_round_to_zero ) {
921 increment = 0;
922 }
923 else {
924 if ( zSign ) {
925 increment = ( roundingMode == float_round_down ) && zSig2;
926 }
927 else {
928 increment = ( roundingMode == float_round_up ) && zSig2;
929 }
930 }
931 }
932 if ( 0x7FFD <= (bits32) zExp ) {
933 if ( ( 0x7FFD < zExp )
934 || ( ( zExp == 0x7FFD )
935 && eq128(
936 LIT64( 0x0001FFFFFFFFFFFF ),
937 LIT64( 0xFFFFFFFFFFFFFFFF ),
938 zSig0,
939 zSig1
940 )
941 && increment
942 )
943 ) {
944 float_raise( float_flag_overflow | float_flag_inexact );
945 if ( ( roundingMode == float_round_to_zero )
946 || ( zSign && ( roundingMode == float_round_up ) )
947 || ( ! zSign && ( roundingMode == float_round_down ) )
948 ) {
949 return
950 packFloat128(
951 zSign,
952 0x7FFE,
953 LIT64( 0x0000FFFFFFFFFFFF ),
954 LIT64( 0xFFFFFFFFFFFFFFFF )
955 );
956 }
957 return packFloat128( zSign, 0x7FFF, 0, 0 );
958 }
959 if ( zExp < 0 ) {
960 isTiny =
961 ( float_detect_tininess == float_tininess_before_rounding )
962 || ( zExp < -1 )
963 || ! increment
964 || lt128(
965 zSig0,
966 zSig1,
967 LIT64( 0x0001FFFFFFFFFFFF ),
968 LIT64( 0xFFFFFFFFFFFFFFFF )
969 );
970 shift128ExtraRightJamming(
971 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
972 zExp = 0;
973 if ( isTiny && zSig2 ) float_raise( float_flag_underflow );
974 if ( roundNearestEven ) {
975 increment = ( (sbits64) zSig2 < 0 );
976 }
977 else {
978 if ( zSign ) {
979 increment = ( roundingMode == float_round_down ) && zSig2;
980 }
981 else {
982 increment = ( roundingMode == float_round_up ) && zSig2;
983 }
984 }
985 }
986 }
987 if ( zSig2 ) float_exception_flags |= float_flag_inexact;
988 if ( increment ) {
989 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
990 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
991 }
992 else {
993 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
994 }
995 return packFloat128( zSign, zExp, zSig0, zSig1 );
996
997 }
998
999 /*----------------------------------------------------------------------------
1000 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1001 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1002 | returns the proper quadruple-precision floating-point value corresponding
1003 | to the abstract input. This routine is just like `roundAndPackFloat128'
1004 | except that the input significand has fewer bits and does not have to be
1005 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1006 | point exponent.
1007 *----------------------------------------------------------------------------*/
1008
1009 static float128
1010 normalizeRoundAndPackFloat128(
1011 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
1012 {
1013 int8 shiftCount;
1014 bits64 zSig2;
1015
1016 if ( zSig0 == 0 ) {
1017 zSig0 = zSig1;
1018 zSig1 = 0;
1019 zExp -= 64;
1020 }
1021 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1022 if ( 0 <= shiftCount ) {
1023 zSig2 = 0;
1024 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1025 }
1026 else {
1027 shift128ExtraRightJamming(
1028 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1029 }
1030 zExp -= shiftCount;
1031 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
1032
1033 }
1034
1035 #endif
1036
1037 /*----------------------------------------------------------------------------
1038 | Returns the result of converting the 32-bit two's complement integer `a'
1039 | to the single-precision floating-point format. The conversion is performed
1040 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1041 *----------------------------------------------------------------------------*/
1042
1043 float32 int32_to_float32( int32 a )
1044 {
1045 flag zSign;
1046
1047 if ( a == 0 ) return 0;
1048 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1049 zSign = ( a < 0 );
1050 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a );
1051
1052 }
1053
1054 /*----------------------------------------------------------------------------
1055 | Returns the result of converting the 32-bit two's complement integer `a'
1056 | to the double-precision floating-point format. The conversion is performed
1057 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1058 *----------------------------------------------------------------------------*/
1059
1060 float64 int32_to_float64( int32 a )
1061 {
1062 flag zSign;
1063 uint32 absA;
1064 int8 shiftCount;
1065 bits64 zSig;
1066
1067 if ( a == 0 ) return 0;
1068 zSign = ( a < 0 );
1069 absA = zSign ? - a : a;
1070 shiftCount = countLeadingZeros32( absA ) + 21;
1071 zSig = absA;
1072 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1073
1074 }
1075
1076 #ifdef FLOATX80
1077
1078 /*----------------------------------------------------------------------------
1079 | Returns the result of converting the 32-bit two's complement integer `a'
1080 | to the extended double-precision floating-point format. The conversion
1081 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1082 | Arithmetic.
1083 *----------------------------------------------------------------------------*/
1084
1085 floatx80 int32_to_floatx80( int32 a )
1086 {
1087 flag zSign;
1088 uint32 absA;
1089 int8 shiftCount;
1090 bits64 zSig;
1091
1092 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1093 zSign = ( a < 0 );
1094 absA = zSign ? - a : a;
1095 shiftCount = countLeadingZeros32( absA ) + 32;
1096 zSig = absA;
1097 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1098
1099 }
1100
1101 #endif
1102
1103 #ifdef FLOAT128
1104
1105 /*----------------------------------------------------------------------------
1106 | Returns the result of converting the 32-bit two's complement integer `a' to
1107 | the quadruple-precision floating-point format. The conversion is performed
1108 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1109 *----------------------------------------------------------------------------*/
1110
1111 float128 int32_to_float128( int32 a )
1112 {
1113 flag zSign;
1114 uint32 absA;
1115 int8 shiftCount;
1116 bits64 zSig0;
1117
1118 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1119 zSign = ( a < 0 );
1120 absA = zSign ? - a : a;
1121 shiftCount = countLeadingZeros32( absA ) + 17;
1122 zSig0 = absA;
1123 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1124
1125 }
1126
1127 #endif
1128
1129 /*----------------------------------------------------------------------------
1130 | Returns the result of converting the 64-bit two's complement integer `a'
1131 | to the single-precision floating-point format. The conversion is performed
1132 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1133 *----------------------------------------------------------------------------*/
1134
1135 float32 int64_to_float32( int64 a )
1136 {
1137 flag zSign;
1138 uint64 absA;
1139 int8 shiftCount;
1140 bits32 zSig;
1141
1142 if ( a == 0 ) return 0;
1143 zSign = ( a < 0 );
1144 absA = zSign ? - a : a;
1145 shiftCount = countLeadingZeros64( absA ) - 40;
1146 if ( 0 <= shiftCount ) {
1147 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1148 }
1149 else {
1150 shiftCount += 7;
1151 if ( shiftCount < 0 ) {
1152 shift64RightJamming( absA, - shiftCount, &absA );
1153 }
1154 else {
1155 absA <<= shiftCount;
1156 }
1157 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA );
1158 }
1159
1160 }
1161
1162 /*----------------------------------------------------------------------------
1163 | Returns the result of converting the 64-bit two's complement integer `a'
1164 | to the double-precision floating-point format. The conversion is performed
1165 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1166 *----------------------------------------------------------------------------*/
1167
1168 float64 int64_to_float64( int64 a )
1169 {
1170 flag zSign;
1171
1172 if ( a == 0 ) return 0;
1173 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
1174 return packFloat64( 1, 0x43E, 0 );
1175 }
1176 zSign = ( a < 0 );
1177 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a );
1178
1179 }
1180
1181 #ifdef FLOATX80
1182
1183 /*----------------------------------------------------------------------------
1184 | Returns the result of converting the 64-bit two's complement integer `a'
1185 | to the extended double-precision floating-point format. The conversion
1186 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1187 | Arithmetic.
1188 *----------------------------------------------------------------------------*/
1189
1190 floatx80 int64_to_floatx80( int64 a )
1191 {
1192 flag zSign;
1193 uint64 absA;
1194 int8 shiftCount;
1195
1196 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1197 zSign = ( a < 0 );
1198 absA = zSign ? - a : a;
1199 shiftCount = countLeadingZeros64( absA );
1200 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1201
1202 }
1203
1204 #endif
1205
1206 #ifdef FLOAT128
1207
1208 /*----------------------------------------------------------------------------
1209 | Returns the result of converting the 64-bit two's complement integer `a' to
1210 | the quadruple-precision floating-point format. The conversion is performed
1211 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1212 *----------------------------------------------------------------------------*/
1213
1214 float128 int64_to_float128( int64 a )
1215 {
1216 flag zSign;
1217 uint64 absA;
1218 int8 shiftCount;
1219 int32 zExp;
1220 bits64 zSig0, zSig1;
1221
1222 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1223 zSign = ( a < 0 );
1224 absA = zSign ? - a : a;
1225 shiftCount = countLeadingZeros64( absA ) + 49;
1226 zExp = 0x406E - shiftCount;
1227 if ( 64 <= shiftCount ) {
1228 zSig1 = 0;
1229 zSig0 = absA;
1230 shiftCount -= 64;
1231 }
1232 else {
1233 zSig1 = absA;
1234 zSig0 = 0;
1235 }
1236 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1237 return packFloat128( zSign, zExp, zSig0, zSig1 );
1238
1239 }
1240
1241 #endif
1242
1243 /*----------------------------------------------------------------------------
1244 | Returns the result of converting the single-precision floating-point value
1245 | `a' to the 32-bit two's complement integer format. The conversion is
1246 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1247 | Arithmetic---which means in particular that the conversion is rounded
1248 | according to the current rounding mode. If `a' is a NaN, the largest
1249 | positive integer is returned. Otherwise, if the conversion overflows, the
1250 | largest integer with the same sign as `a' is returned.
1251 *----------------------------------------------------------------------------*/
1252
1253 int32 float32_to_int32( float32 a )
1254 {
1255 flag aSign;
1256 int16 aExp, shiftCount;
1257 bits32 aSig;
1258 bits64 aSig64;
1259
1260 aSig = extractFloat32Frac( a );
1261 aExp = extractFloat32Exp( a );
1262 aSign = extractFloat32Sign( a );
1263 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1264 if ( aExp ) aSig |= 0x00800000;
1265 shiftCount = 0xAF - aExp;
1266 aSig64 = aSig;
1267 aSig64 <<= 32;
1268 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1269 return roundAndPackInt32( aSign, aSig64 );
1270
1271 }
1272
1273 /*----------------------------------------------------------------------------
1274 | Returns the result of converting the single-precision floating-point value
1275 | `a' to the 32-bit two's complement integer format. The conversion is
1276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1277 | Arithmetic, except that the conversion is always rounded toward zero.
1278 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1279 | the conversion overflows, the largest integer with the same sign as `a' is
1280 | returned.
1281 *----------------------------------------------------------------------------*/
1282
1283 int32 float32_to_int32_round_to_zero( float32 a )
1284 {
1285 flag aSign;
1286 int16 aExp, shiftCount;
1287 bits32 aSig;
1288 int32 z;
1289
1290 aSig = extractFloat32Frac( a );
1291 aExp = extractFloat32Exp( a );
1292 aSign = extractFloat32Sign( a );
1293 shiftCount = aExp - 0x9E;
1294 if ( 0 <= shiftCount ) {
1295 if ( a != 0xCF000000 ) {
1296 float_raise( float_flag_invalid );
1297 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1298 }
1299 return (sbits32) 0x80000000;
1300 }
1301 else if ( aExp <= 0x7E ) {
1302 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1303 return 0;
1304 }
1305 aSig = ( aSig | 0x00800000 )<<8;
1306 z = aSig>>( - shiftCount );
1307 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
1308 float_exception_flags |= float_flag_inexact;
1309 }
1310 if ( aSign ) z = - z;
1311 return z;
1312
1313 }
1314
1315 /*----------------------------------------------------------------------------
1316 | Returns the result of converting the single-precision floating-point value
1317 | `a' to the 64-bit two's complement integer format. The conversion is
1318 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1319 | Arithmetic---which means in particular that the conversion is rounded
1320 | according to the current rounding mode. If `a' is a NaN, the largest
1321 | positive integer is returned. Otherwise, if the conversion overflows, the
1322 | largest integer with the same sign as `a' is returned.
1323 *----------------------------------------------------------------------------*/
1324
1325 int64 float32_to_int64( float32 a )
1326 {
1327 flag aSign;
1328 int16 aExp, shiftCount;
1329 bits32 aSig;
1330 bits64 aSig64, aSigExtra;
1331
1332 aSig = extractFloat32Frac( a );
1333 aExp = extractFloat32Exp( a );
1334 aSign = extractFloat32Sign( a );
1335 shiftCount = 0xBE - aExp;
1336 if ( shiftCount < 0 ) {
1337 float_raise( float_flag_invalid );
1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1339 return LIT64( 0x7FFFFFFFFFFFFFFF );
1340 }
1341 return (sbits64) LIT64( 0x8000000000000000 );
1342 }
1343 if ( aExp ) aSig |= 0x00800000;
1344 aSig64 = aSig;
1345 aSig64 <<= 40;
1346 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1347 return roundAndPackInt64( aSign, aSig64, aSigExtra );
1348
1349 }
1350
1351 /*----------------------------------------------------------------------------
1352 | Returns the result of converting the single-precision floating-point value
1353 | `a' to the 64-bit two's complement integer format. The conversion is
1354 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1355 | Arithmetic, except that the conversion is always rounded toward zero. If
1356 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1357 | conversion overflows, the largest integer with the same sign as `a' is
1358 | returned.
1359 *----------------------------------------------------------------------------*/
1360
1361 int64 float32_to_int64_round_to_zero( float32 a )
1362 {
1363 flag aSign;
1364 int16 aExp, shiftCount;
1365 bits32 aSig;
1366 bits64 aSig64;
1367 int64 z;
1368
1369 aSig = extractFloat32Frac( a );
1370 aExp = extractFloat32Exp( a );
1371 aSign = extractFloat32Sign( a );
1372 shiftCount = aExp - 0xBE;
1373 if ( 0 <= shiftCount ) {
1374 if ( a != 0xDF000000 ) {
1375 float_raise( float_flag_invalid );
1376 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1377 return LIT64( 0x7FFFFFFFFFFFFFFF );
1378 }
1379 }
1380 return (sbits64) LIT64( 0x8000000000000000 );
1381 }
1382 else if ( aExp <= 0x7E ) {
1383 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
1384 return 0;
1385 }
1386 aSig64 = aSig | 0x00800000;
1387 aSig64 <<= 40;
1388 z = aSig64>>( - shiftCount );
1389 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
1390 float_exception_flags |= float_flag_inexact;
1391 }
1392 if ( aSign ) z = - z;
1393 return z;
1394
1395 }
1396
1397 /*----------------------------------------------------------------------------
1398 | Returns the result of converting the single-precision floating-point value
1399 | `a' to the double-precision floating-point format. The conversion is
1400 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1401 | Arithmetic.
1402 *----------------------------------------------------------------------------*/
1403
1404 float64 float32_to_float64( float32 a )
1405 {
1406 flag aSign;
1407 int16 aExp;
1408 bits32 aSig;
1409
1410 aSig = extractFloat32Frac( a );
1411 aExp = extractFloat32Exp( a );
1412 aSign = extractFloat32Sign( a );
1413 if ( aExp == 0xFF ) {
1414 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a ) );
1415 return packFloat64( aSign, 0x7FF, 0 );
1416 }
1417 if ( aExp == 0 ) {
1418 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1419 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1420 --aExp;
1421 }
1422 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
1423
1424 }
1425
1426 #ifdef FLOATX80
1427
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the single-precision floating-point value
1430 | `a' to the extended double-precision floating-point format. The conversion
1431 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1432 | Arithmetic.
1433 *----------------------------------------------------------------------------*/
1434
1435 floatx80 float32_to_floatx80( float32 a )
1436 {
1437 flag aSign;
1438 int16 aExp;
1439 bits32 aSig;
1440
1441 aSig = extractFloat32Frac( a );
1442 aExp = extractFloat32Exp( a );
1443 aSign = extractFloat32Sign( a );
1444 if ( aExp == 0xFF ) {
1445 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a ) );
1446 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1447 }
1448 if ( aExp == 0 ) {
1449 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1450 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1451 }
1452 aSig |= 0x00800000;
1453 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
1454
1455 }
1456
1457 #endif
1458
1459 #ifdef FLOAT128
1460
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the single-precision floating-point value
1463 | `a' to the double-precision floating-point format. The conversion is
1464 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1465 | Arithmetic.
1466 *----------------------------------------------------------------------------*/
1467
1468 float128 float32_to_float128( float32 a )
1469 {
1470 flag aSign;
1471 int16 aExp;
1472 bits32 aSig;
1473
1474 aSig = extractFloat32Frac( a );
1475 aExp = extractFloat32Exp( a );
1476 aSign = extractFloat32Sign( a );
1477 if ( aExp == 0xFF ) {
1478 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a ) );
1479 return packFloat128( aSign, 0x7FFF, 0, 0 );
1480 }
1481 if ( aExp == 0 ) {
1482 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1483 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1484 --aExp;
1485 }
1486 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
1487
1488 }
1489
1490 #endif
1491
1492 /*----------------------------------------------------------------------------
1493 | Rounds the single-precision floating-point value `a' to an integer, and
1494 | returns the result as a single-precision floating-point value. The
1495 | operation is performed according to the IEC/IEEE Standard for Binary
1496 | Floating-Point Arithmetic.
1497 *----------------------------------------------------------------------------*/
1498
1499 float32 float32_round_to_int( float32 a )
1500 {
1501 flag aSign;
1502 int16 aExp;
1503 bits32 lastBitMask, roundBitsMask;
1504 int8 roundingMode;
1505 float32 z;
1506
1507 aExp = extractFloat32Exp( a );
1508 if ( 0x96 <= aExp ) {
1509 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1510 return propagateFloat32NaN( a, a );
1511 }
1512 return a;
1513 }
1514 if ( aExp <= 0x7E ) {
1515 if ( (bits32) ( a<<1 ) == 0 ) return a;
1516 float_exception_flags |= float_flag_inexact;
1517 aSign = extractFloat32Sign( a );
1518 switch ( float_rounding_mode ) {
1519 case float_round_nearest_even:
1520 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1521 return packFloat32( aSign, 0x7F, 0 );
1522 }
1523 break;
1524 case float_round_down:
1525 return aSign ? 0xBF800000 : 0;
1526 case float_round_up:
1527 return aSign ? 0x80000000 : 0x3F800000;
1528 }
1529 return packFloat32( aSign, 0, 0 );
1530 }
1531 lastBitMask = 1;
1532 lastBitMask <<= 0x96 - aExp;
1533 roundBitsMask = lastBitMask - 1;
1534 z = a;
1535 roundingMode = float_rounding_mode;
1536 if ( roundingMode == float_round_nearest_even ) {
1537 z += lastBitMask>>1;
1538 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
1539 }
1540 else if ( roundingMode != float_round_to_zero ) {
1541 if ( extractFloat32Sign( z ) ^ ( roundingMode == float_round_up ) ) {
1542 z += roundBitsMask;
1543 }
1544 }
1545 z &= ~ roundBitsMask;
1546 if ( z != a ) float_exception_flags |= float_flag_inexact;
1547 return z;
1548
1549 }
1550
1551 /*----------------------------------------------------------------------------
1552 | Returns the result of adding the absolute values of the single-precision
1553 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1554 | before being returned. `zSign' is ignored if the result is a NaN.
1555 | The addition is performed according to the IEC/IEEE Standard for Binary
1556 | Floating-Point Arithmetic.
1557 *----------------------------------------------------------------------------*/
1558
1559 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign )
1560 {
1561 int16 aExp, bExp, zExp;
1562 bits32 aSig, bSig, zSig;
1563 int16 expDiff;
1564
1565 aSig = extractFloat32Frac( a );
1566 aExp = extractFloat32Exp( a );
1567 bSig = extractFloat32Frac( b );
1568 bExp = extractFloat32Exp( b );
1569 expDiff = aExp - bExp;
1570 aSig <<= 6;
1571 bSig <<= 6;
1572 if ( 0 < expDiff ) {
1573 if ( aExp == 0xFF ) {
1574 if ( aSig ) return propagateFloat32NaN( a, b );
1575 return a;
1576 }
1577 if ( bExp == 0 ) {
1578 --expDiff;
1579 }
1580 else {
1581 bSig |= 0x20000000;
1582 }
1583 shift32RightJamming( bSig, expDiff, &bSig );
1584 zExp = aExp;
1585 }
1586 else if ( expDiff < 0 ) {
1587 if ( bExp == 0xFF ) {
1588 if ( bSig ) return propagateFloat32NaN( a, b );
1589 return packFloat32( zSign, 0xFF, 0 );
1590 }
1591 if ( aExp == 0 ) {
1592 ++expDiff;
1593 }
1594 else {
1595 aSig |= 0x20000000;
1596 }
1597 shift32RightJamming( aSig, - expDiff, &aSig );
1598 zExp = bExp;
1599 }
1600 else {
1601 if ( aExp == 0xFF ) {
1602 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1603 return a;
1604 }
1605 if ( aExp == 0 ) return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
1606 zSig = 0x40000000 + aSig + bSig;
1607 zExp = aExp;
1608 goto roundAndPack;
1609 }
1610 aSig |= 0x20000000;
1611 zSig = ( aSig + bSig )<<1;
1612 --zExp;
1613 if ( (sbits32) zSig < 0 ) {
1614 zSig = aSig + bSig;
1615 ++zExp;
1616 }
1617 roundAndPack:
1618 return roundAndPackFloat32( zSign, zExp, zSig );
1619
1620 }
1621
1622 /*----------------------------------------------------------------------------
1623 | Returns the result of subtracting the absolute values of the single-
1624 | precision floating-point values `a' and `b'. If `zSign' is 1, the
1625 | difference is negated before being returned. `zSign' is ignored if the
1626 | result is a NaN. The subtraction is performed according to the IEC/IEEE
1627 | Standard for Binary Floating-Point Arithmetic.
1628 *----------------------------------------------------------------------------*/
1629
1630 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign )
1631 {
1632 int16 aExp, bExp, zExp;
1633 bits32 aSig, bSig, zSig;
1634 int16 expDiff;
1635
1636 aSig = extractFloat32Frac( a );
1637 aExp = extractFloat32Exp( a );
1638 bSig = extractFloat32Frac( b );
1639 bExp = extractFloat32Exp( b );
1640 expDiff = aExp - bExp;
1641 aSig <<= 7;
1642 bSig <<= 7;
1643 if ( 0 < expDiff ) goto aExpBigger;
1644 if ( expDiff < 0 ) goto bExpBigger;
1645 if ( aExp == 0xFF ) {
1646 if ( aSig | bSig ) return propagateFloat32NaN( a, b );
1647 float_raise( float_flag_invalid );
1648 return float32_default_nan;
1649 }
1650 if ( aExp == 0 ) {
1651 aExp = 1;
1652 bExp = 1;
1653 }
1654 if ( bSig < aSig ) goto aBigger;
1655 if ( aSig < bSig ) goto bBigger;
1656 return packFloat32( float_rounding_mode == float_round_down, 0, 0 );
1657 bExpBigger:
1658 if ( bExp == 0xFF ) {
1659 if ( bSig ) return propagateFloat32NaN( a, b );
1660 return packFloat32( zSign ^ 1, 0xFF, 0 );
1661 }
1662 if ( aExp == 0 ) {
1663 ++expDiff;
1664 }
1665 else {
1666 aSig |= 0x40000000;
1667 }
1668 shift32RightJamming( aSig, - expDiff, &aSig );
1669 bSig |= 0x40000000;
1670 bBigger:
1671 zSig = bSig - aSig;
1672 zExp = bExp;
1673 zSign ^= 1;
1674 goto normalizeRoundAndPack;
1675 aExpBigger:
1676 if ( aExp == 0xFF ) {
1677 if ( aSig ) return propagateFloat32NaN( a, b );
1678 return a;
1679 }
1680 if ( bExp == 0 ) {
1681 --expDiff;
1682 }
1683 else {
1684 bSig |= 0x40000000;
1685 }
1686 shift32RightJamming( bSig, expDiff, &bSig );
1687 aSig |= 0x40000000;
1688 aBigger:
1689 zSig = aSig - bSig;
1690 zExp = aExp;
1691 normalizeRoundAndPack:
1692 --zExp;
1693 return normalizeRoundAndPackFloat32( zSign, zExp, zSig );
1694
1695 }
1696
1697 /*----------------------------------------------------------------------------
1698 | Returns the result of adding the single-precision floating-point values `a'
1699 | and `b'. The operation is performed according to the IEC/IEEE Standard for
1700 | Binary Floating-Point Arithmetic.
1701 *----------------------------------------------------------------------------*/
1702
1703 float32 float32_add( float32 a, float32 b )
1704 {
1705 flag aSign, bSign;
1706
1707 aSign = extractFloat32Sign( a );
1708 bSign = extractFloat32Sign( b );
1709 if ( aSign == bSign ) {
1710 return addFloat32Sigs( a, b, aSign );
1711 }
1712 else {
1713 return subFloat32Sigs( a, b, aSign );
1714 }
1715
1716 }
1717
1718 /*----------------------------------------------------------------------------
1719 | Returns the result of subtracting the single-precision floating-point values
1720 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1721 | for Binary Floating-Point Arithmetic.
1722 *----------------------------------------------------------------------------*/
1723
1724 float32 float32_sub( float32 a, float32 b )
1725 {
1726 flag aSign, bSign;
1727
1728 aSign = extractFloat32Sign( a );
1729 bSign = extractFloat32Sign( b );
1730 if ( aSign == bSign ) {
1731 return subFloat32Sigs( a, b, aSign );
1732 }
1733 else {
1734 return addFloat32Sigs( a, b, aSign );
1735 }
1736
1737 }
1738
1739 /*----------------------------------------------------------------------------
1740 | Returns the result of multiplying the single-precision floating-point values
1741 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
1742 | for Binary Floating-Point Arithmetic.
1743 *----------------------------------------------------------------------------*/
1744
1745 float32 float32_mul( float32 a, float32 b )
1746 {
1747 flag aSign, bSign, zSign;
1748 int16 aExp, bExp, zExp;
1749 bits32 aSig, bSig;
1750 bits64 zSig64;
1751 bits32 zSig;
1752
1753 aSig = extractFloat32Frac( a );
1754 aExp = extractFloat32Exp( a );
1755 aSign = extractFloat32Sign( a );
1756 bSig = extractFloat32Frac( b );
1757 bExp = extractFloat32Exp( b );
1758 bSign = extractFloat32Sign( b );
1759 zSign = aSign ^ bSign;
1760 if ( aExp == 0xFF ) {
1761 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1762 return propagateFloat32NaN( a, b );
1763 }
1764 if ( ( bExp | bSig ) == 0 ) {
1765 float_raise( float_flag_invalid );
1766 return float32_default_nan;
1767 }
1768 return packFloat32( zSign, 0xFF, 0 );
1769 }
1770 if ( bExp == 0xFF ) {
1771 if ( bSig ) return propagateFloat32NaN( a, b );
1772 if ( ( aExp | aSig ) == 0 ) {
1773 float_raise( float_flag_invalid );
1774 return float32_default_nan;
1775 }
1776 return packFloat32( zSign, 0xFF, 0 );
1777 }
1778 if ( aExp == 0 ) {
1779 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1780 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1781 }
1782 if ( bExp == 0 ) {
1783 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
1784 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1785 }
1786 zExp = aExp + bExp - 0x7F;
1787 aSig = ( aSig | 0x00800000 )<<7;
1788 bSig = ( bSig | 0x00800000 )<<8;
1789 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
1790 zSig = zSig64;
1791 if ( 0 <= (sbits32) ( zSig<<1 ) ) {
1792 zSig <<= 1;
1793 --zExp;
1794 }
1795 return roundAndPackFloat32( zSign, zExp, zSig );
1796
1797 }
1798
1799 /*----------------------------------------------------------------------------
1800 | Returns the result of dividing the single-precision floating-point value `a'
1801 | by the corresponding value `b'. The operation is performed according to the
1802 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1803 *----------------------------------------------------------------------------*/
1804
1805 float32 float32_div( float32 a, float32 b )
1806 {
1807 flag aSign, bSign, zSign;
1808 int16 aExp, bExp, zExp;
1809 bits32 aSig, bSig, zSig;
1810
1811 aSig = extractFloat32Frac( a );
1812 aExp = extractFloat32Exp( a );
1813 aSign = extractFloat32Sign( a );
1814 bSig = extractFloat32Frac( b );
1815 bExp = extractFloat32Exp( b );
1816 bSign = extractFloat32Sign( b );
1817 zSign = aSign ^ bSign;
1818 if ( aExp == 0xFF ) {
1819 if ( aSig ) return propagateFloat32NaN( a, b );
1820 if ( bExp == 0xFF ) {
1821 if ( bSig ) return propagateFloat32NaN( a, b );
1822 float_raise( float_flag_invalid );
1823 return float32_default_nan;
1824 }
1825 return packFloat32( zSign, 0xFF, 0 );
1826 }
1827 if ( bExp == 0xFF ) {
1828 if ( bSig ) return propagateFloat32NaN( a, b );
1829 return packFloat32( zSign, 0, 0 );
1830 }
1831 if ( bExp == 0 ) {
1832 if ( bSig == 0 ) {
1833 if ( ( aExp | aSig ) == 0 ) {
1834 float_raise( float_flag_invalid );
1835 return float32_default_nan;
1836 }
1837 float_raise( float_flag_divbyzero );
1838 return packFloat32( zSign, 0xFF, 0 );
1839 }
1840 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1841 }
1842 if ( aExp == 0 ) {
1843 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
1844 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1845 }
1846 zExp = aExp - bExp + 0x7D;
1847 aSig = ( aSig | 0x00800000 )<<7;
1848 bSig = ( bSig | 0x00800000 )<<8;
1849 if ( bSig <= ( aSig + aSig ) ) {
1850 aSig >>= 1;
1851 ++zExp;
1852 }
1853 zSig = ( ( (bits64) aSig )<<32 ) / bSig;
1854 if ( ( zSig & 0x3F ) == 0 ) {
1855 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
1856 }
1857 return roundAndPackFloat32( zSign, zExp, zSig );
1858
1859 }
1860
1861 /*----------------------------------------------------------------------------
1862 | Returns the remainder of the single-precision floating-point value `a'
1863 | with respect to the corresponding value `b'. The operation is performed
1864 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1865 *----------------------------------------------------------------------------*/
1866
1867 float32 float32_rem( float32 a, float32 b )
1868 {
1869 flag aSign, bSign, zSign;
1870 int16 aExp, bExp, expDiff;
1871 bits32 aSig, bSig;
1872 bits32 q;
1873 bits64 aSig64, bSig64, q64;
1874 bits32 alternateASig;
1875 sbits32 sigMean;
1876
1877 aSig = extractFloat32Frac( a );
1878 aExp = extractFloat32Exp( a );
1879 aSign = extractFloat32Sign( a );
1880 bSig = extractFloat32Frac( b );
1881 bExp = extractFloat32Exp( b );
1882 bSign = extractFloat32Sign( b );
1883 if ( aExp == 0xFF ) {
1884 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
1885 return propagateFloat32NaN( a, b );
1886 }
1887 float_raise( float_flag_invalid );
1888 return float32_default_nan;
1889 }
1890 if ( bExp == 0xFF ) {
1891 if ( bSig ) return propagateFloat32NaN( a, b );
1892 return a;
1893 }
1894 if ( bExp == 0 ) {
1895 if ( bSig == 0 ) {
1896 float_raise( float_flag_invalid );
1897 return float32_default_nan;
1898 }
1899 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
1900 }
1901 if ( aExp == 0 ) {
1902 if ( aSig == 0 ) return a;
1903 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1904 }
1905 expDiff = aExp - bExp;
1906 aSig |= 0x00800000;
1907 bSig |= 0x00800000;
1908 if ( expDiff < 32 ) {
1909 aSig <<= 8;
1910 bSig <<= 8;
1911 if ( expDiff < 0 ) {
1912 if ( expDiff < -1 ) return a;
1913 aSig >>= 1;
1914 }
1915 q = ( bSig <= aSig );
1916 if ( q ) aSig -= bSig;
1917 if ( 0 < expDiff ) {
1918 q = ( ( (bits64) aSig )<<32 ) / bSig;
1919 q >>= 32 - expDiff;
1920 bSig >>= 2;
1921 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
1922 }
1923 else {
1924 aSig >>= 2;
1925 bSig >>= 2;
1926 }
1927 }
1928 else {
1929 if ( bSig <= aSig ) aSig -= bSig;
1930 aSig64 = ( (bits64) aSig )<<40;
1931 bSig64 = ( (bits64) bSig )<<40;
1932 expDiff -= 64;
1933 while ( 0 < expDiff ) {
1934 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1935 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1936 aSig64 = - ( ( bSig * q64 )<<38 );
1937 expDiff -= 62;
1938 }
1939 expDiff += 64;
1940 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
1941 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
1942 q = q64>>( 64 - expDiff );
1943 bSig <<= 6;
1944 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
1945 }
1946 do {
1947 alternateASig = aSig;
1948 ++q;
1949 aSig -= bSig;
1950 } while ( 0 <= (sbits32) aSig );
1951 sigMean = aSig + alternateASig;
1952 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
1953 aSig = alternateASig;
1954 }
1955 zSign = ( (sbits32) aSig < 0 );
1956 if ( zSign ) aSig = - aSig;
1957 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig );
1958
1959 }
1960
1961 /*----------------------------------------------------------------------------
1962 | Returns the square root of the single-precision floating-point value `a'.
1963 | The operation is performed according to the IEC/IEEE Standard for Binary
1964 | Floating-Point Arithmetic.
1965 *----------------------------------------------------------------------------*/
1966
1967 float32 float32_sqrt( float32 a )
1968 {
1969 flag aSign;
1970 int16 aExp, zExp;
1971 bits32 aSig, zSig;
1972 bits64 rem, term;
1973
1974 aSig = extractFloat32Frac( a );
1975 aExp = extractFloat32Exp( a );
1976 aSign = extractFloat32Sign( a );
1977 if ( aExp == 0xFF ) {
1978 if ( aSig ) return propagateFloat32NaN( a, 0 );
1979 if ( ! aSign ) return a;
1980 float_raise( float_flag_invalid );
1981 return float32_default_nan;
1982 }
1983 if ( aSign ) {
1984 if ( ( aExp | aSig ) == 0 ) return a;
1985 float_raise( float_flag_invalid );
1986 return float32_default_nan;
1987 }
1988 if ( aExp == 0 ) {
1989 if ( aSig == 0 ) return 0;
1990 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1991 }
1992 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
1993 aSig = ( aSig | 0x00800000 )<<8;
1994 zSig = estimateSqrt32( aExp, aSig ) + 2;
1995 if ( ( zSig & 0x7F ) <= 5 ) {
1996 if ( zSig < 2 ) {
1997 zSig = 0x7FFFFFFF;
1998 goto roundAndPack;
1999 }
2000 aSig >>= aExp & 1;
2001 term = ( (bits64) zSig ) * zSig;
2002 rem = ( ( (bits64) aSig )<<32 ) - term;
2003 while ( (sbits64) rem < 0 ) {
2004 --zSig;
2005 rem += ( ( (bits64) zSig )<<1 ) | 1;
2006 }
2007 zSig |= ( rem != 0 );
2008 }
2009 shift32RightJamming( zSig, 1, &zSig );
2010 roundAndPack:
2011 return roundAndPackFloat32( 0, zExp, zSig );
2012
2013 }
2014
2015 /*----------------------------------------------------------------------------
2016 | Returns 1 if the single-precision floating-point value `a' is equal to
2017 | the corresponding value `b', and 0 otherwise. The comparison is performed
2018 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2019 *----------------------------------------------------------------------------*/
2020
2021 flag float32_eq( float32 a, float32 b )
2022 {
2023
2024 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2025 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2026 ) {
2027 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2028 float_raise( float_flag_invalid );
2029 }
2030 return 0;
2031 }
2032 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2033
2034 }
2035
2036 /*----------------------------------------------------------------------------
2037 | Returns 1 if the single-precision floating-point value `a' is less than
2038 | or equal to the corresponding value `b', and 0 otherwise. The comparison
2039 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2040 | Arithmetic.
2041 *----------------------------------------------------------------------------*/
2042
2043 flag float32_le( float32 a, float32 b )
2044 {
2045 flag aSign, bSign;
2046
2047 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2048 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2049 ) {
2050 float_raise( float_flag_invalid );
2051 return 0;
2052 }
2053 aSign = extractFloat32Sign( a );
2054 bSign = extractFloat32Sign( b );
2055 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2056 return ( a == b ) || ( aSign ^ ( a < b ) );
2057
2058 }
2059
2060 /*----------------------------------------------------------------------------
2061 | Returns 1 if the single-precision floating-point value `a' is less than
2062 | the corresponding value `b', and 0 otherwise. The comparison is performed
2063 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2064 *----------------------------------------------------------------------------*/
2065
2066 flag float32_lt( float32 a, float32 b )
2067 {
2068 flag aSign, bSign;
2069
2070 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2071 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2072 ) {
2073 float_raise( float_flag_invalid );
2074 return 0;
2075 }
2076 aSign = extractFloat32Sign( a );
2077 bSign = extractFloat32Sign( b );
2078 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2079 return ( a != b ) && ( aSign ^ ( a < b ) );
2080
2081 }
2082
2083 /*----------------------------------------------------------------------------
2084 | Returns 1 if the single-precision floating-point value `a' is equal to
2085 | the corresponding value `b', and 0 otherwise. The invalid exception is
2086 | raised if either operand is a NaN. Otherwise, the comparison is performed
2087 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2088 *----------------------------------------------------------------------------*/
2089
2090 flag float32_eq_signaling( float32 a, float32 b )
2091 {
2092
2093 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2094 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2095 ) {
2096 float_raise( float_flag_invalid );
2097 return 0;
2098 }
2099 return ( a == b ) || ( (bits32) ( ( a | b )<<1 ) == 0 );
2100
2101 }
2102
2103 /*----------------------------------------------------------------------------
2104 | Returns 1 if the single-precision floating-point value `a' is less than or
2105 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2106 | cause an exception. Otherwise, the comparison is performed according to the
2107 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2108 *----------------------------------------------------------------------------*/
2109
2110 flag float32_le_quiet( float32 a, float32 b )
2111 {
2112 flag aSign, bSign;
2113 int16 aExp, bExp;
2114
2115 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2116 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2117 ) {
2118 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2119 float_raise( float_flag_invalid );
2120 }
2121 return 0;
2122 }
2123 aSign = extractFloat32Sign( a );
2124 bSign = extractFloat32Sign( b );
2125 if ( aSign != bSign ) return aSign || ( (bits32) ( ( a | b )<<1 ) == 0 );
2126 return ( a == b ) || ( aSign ^ ( a < b ) );
2127
2128 }
2129
2130 /*----------------------------------------------------------------------------
2131 | Returns 1 if the single-precision floating-point value `a' is less than
2132 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2133 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
2134 | Standard for Binary Floating-Point Arithmetic.
2135 *----------------------------------------------------------------------------*/
2136
2137 flag float32_lt_quiet( float32 a, float32 b )
2138 {
2139 flag aSign, bSign;
2140
2141 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2142 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2143 ) {
2144 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2145 float_raise( float_flag_invalid );
2146 }
2147 return 0;
2148 }
2149 aSign = extractFloat32Sign( a );
2150 bSign = extractFloat32Sign( b );
2151 if ( aSign != bSign ) return aSign && ( (bits32) ( ( a | b )<<1 ) != 0 );
2152 return ( a != b ) && ( aSign ^ ( a < b ) );
2153
2154 }
2155
2156 /*----------------------------------------------------------------------------
2157 | Returns the result of converting the double-precision floating-point value
2158 | `a' to the 32-bit two's complement integer format. The conversion is
2159 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2160 | Arithmetic---which means in particular that the conversion is rounded
2161 | according to the current rounding mode. If `a' is a NaN, the largest
2162 | positive integer is returned. Otherwise, if the conversion overflows, the
2163 | largest integer with the same sign as `a' is returned.
2164 *----------------------------------------------------------------------------*/
2165
2166 int32 float64_to_int32( float64 a )
2167 {
2168 flag aSign;
2169 int16 aExp, shiftCount;
2170 bits64 aSig;
2171
2172 aSig = extractFloat64Frac( a );
2173 aExp = extractFloat64Exp( a );
2174 aSign = extractFloat64Sign( a );
2175 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2176 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2177 shiftCount = 0x42C - aExp;
2178 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
2179 return roundAndPackInt32( aSign, aSig );
2180
2181 }
2182
2183 /*----------------------------------------------------------------------------
2184 | Returns the result of converting the double-precision floating-point value
2185 | `a' to the 32-bit two's complement integer format. The conversion is
2186 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2187 | Arithmetic, except that the conversion is always rounded toward zero.
2188 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2189 | the conversion overflows, the largest integer with the same sign as `a' is
2190 | returned.
2191 *----------------------------------------------------------------------------*/
2192
2193 int32 float64_to_int32_round_to_zero( float64 a )
2194 {
2195 flag aSign;
2196 int16 aExp, shiftCount;
2197 bits64 aSig, savedASig;
2198 int32 z;
2199
2200 aSig = extractFloat64Frac( a );
2201 aExp = extractFloat64Exp( a );
2202 aSign = extractFloat64Sign( a );
2203 if ( 0x41E < aExp ) {
2204 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
2205 goto invalid;
2206 }
2207 else if ( aExp < 0x3FF ) {
2208 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
2209 return 0;
2210 }
2211 aSig |= LIT64( 0x0010000000000000 );
2212 shiftCount = 0x433 - aExp;
2213 savedASig = aSig;
2214 aSig >>= shiftCount;
2215 z = aSig;
2216 if ( aSign ) z = - z;
2217 if ( ( z < 0 ) ^ aSign ) {
2218 invalid:
2219 float_raise( float_flag_invalid );
2220 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
2221 }
2222 if ( ( aSig<<shiftCount ) != savedASig ) {
2223 float_exception_flags |= float_flag_inexact;
2224 }
2225 return z;
2226
2227 }
2228
2229 /*----------------------------------------------------------------------------
2230 | Returns the result of converting the double-precision floating-point value
2231 | `a' to the 64-bit two's complement integer format. The conversion is
2232 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2233 | Arithmetic---which means in particular that the conversion is rounded
2234 | according to the current rounding mode. If `a' is a NaN, the largest
2235 | positive integer is returned. Otherwise, if the conversion overflows, the
2236 | largest integer with the same sign as `a' is returned.
2237 *----------------------------------------------------------------------------*/
2238
2239 int64 float64_to_int64( float64 a )
2240 {
2241 flag aSign;
2242 int16 aExp, shiftCount;
2243 bits64 aSig, aSigExtra;
2244
2245 aSig = extractFloat64Frac( a );
2246 aExp = extractFloat64Exp( a );
2247 aSign = extractFloat64Sign( a );
2248 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2249 shiftCount = 0x433 - aExp;
2250 if ( shiftCount <= 0 ) {
2251 if ( 0x43E < aExp ) {
2252 float_raise( float_flag_invalid );
2253 if ( ! aSign
2254 || ( ( aExp == 0x7FF )
2255 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2256 ) {
2257 return LIT64( 0x7FFFFFFFFFFFFFFF );
2258 }
2259 return (sbits64) LIT64( 0x8000000000000000 );
2260 }
2261 aSigExtra = 0;
2262 aSig <<= - shiftCount;
2263 }
2264 else {
2265 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
2266 }
2267 return roundAndPackInt64( aSign, aSig, aSigExtra );
2268
2269 }
2270
2271 /*----------------------------------------------------------------------------
2272 | Returns the result of converting the double-precision floating-point value
2273 | `a' to the 64-bit two's complement integer format. The conversion is
2274 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2275 | Arithmetic, except that the conversion is always rounded toward zero.
2276 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
2277 | the conversion overflows, the largest integer with the same sign as `a' is
2278 | returned.
2279 *----------------------------------------------------------------------------*/
2280
2281 int64 float64_to_int64_round_to_zero( float64 a )
2282 {
2283 flag aSign;
2284 int16 aExp, shiftCount;
2285 bits64 aSig;
2286 int64 z;
2287
2288 aSig = extractFloat64Frac( a );
2289 aExp = extractFloat64Exp( a );
2290 aSign = extractFloat64Sign( a );
2291 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
2292 shiftCount = aExp - 0x433;
2293 if ( 0 <= shiftCount ) {
2294 if ( 0x43E <= aExp ) {
2295 if ( a != LIT64( 0xC3E0000000000000 ) ) {
2296 float_raise( float_flag_invalid );
2297 if ( ! aSign
2298 || ( ( aExp == 0x7FF )
2299 && ( aSig != LIT64( 0x0010000000000000 ) ) )
2300 ) {
2301 return LIT64( 0x7FFFFFFFFFFFFFFF );
2302 }
2303 }
2304 return (sbits64) LIT64( 0x8000000000000000 );
2305 }
2306 z = aSig<<shiftCount;
2307 }
2308 else {
2309 if ( aExp < 0x3FE ) {
2310 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
2311 return 0;
2312 }
2313 z = aSig>>( - shiftCount );
2314 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
2315 float_exception_flags |= float_flag_inexact;
2316 }
2317 }
2318 if ( aSign ) z = - z;
2319 return z;
2320
2321 }
2322
2323 /*----------------------------------------------------------------------------
2324 | Returns the result of converting the double-precision floating-point value
2325 | `a' to the single-precision floating-point format. The conversion is
2326 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2327 | Arithmetic.
2328 *----------------------------------------------------------------------------*/
2329
2330 float32 float64_to_float32( float64 a )
2331 {
2332 flag aSign;
2333 int16 aExp;
2334 bits64 aSig;
2335 bits32 zSig;
2336
2337 aSig = extractFloat64Frac( a );
2338 aExp = extractFloat64Exp( a );
2339 aSign = extractFloat64Sign( a );
2340 if ( aExp == 0x7FF ) {
2341 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a ) );
2342 return packFloat32( aSign, 0xFF, 0 );
2343 }
2344 shift64RightJamming( aSig, 22, &aSig );
2345 zSig = aSig;
2346 if ( aExp || zSig ) {
2347 zSig |= 0x40000000;
2348 aExp -= 0x381;
2349 }
2350 return roundAndPackFloat32( aSign, aExp, zSig );
2351
2352 }
2353
2354 #ifdef FLOATX80
2355
2356 /*----------------------------------------------------------------------------
2357 | Returns the result of converting the double-precision floating-point value
2358 | `a' to the extended double-precision floating-point format. The conversion
2359 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
2360 | Arithmetic.
2361 *----------------------------------------------------------------------------*/
2362
2363 floatx80 float64_to_floatx80( float64 a )
2364 {
2365 flag aSign;
2366 int16 aExp;
2367 bits64 aSig;
2368
2369 aSig = extractFloat64Frac( a );
2370 aExp = extractFloat64Exp( a );
2371 aSign = extractFloat64Sign( a );
2372 if ( aExp == 0x7FF ) {
2373 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a ) );
2374 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
2375 }
2376 if ( aExp == 0 ) {
2377 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
2378 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2379 }
2380 return
2381 packFloatx80(
2382 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
2383
2384 }
2385
2386 #endif
2387
2388 #ifdef FLOAT128
2389
2390 /*----------------------------------------------------------------------------
2391 | Returns the result of converting the double-precision floating-point value
2392 | `a' to the quadruple-precision floating-point format. The conversion is
2393 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2394 | Arithmetic.
2395 *----------------------------------------------------------------------------*/
2396
2397 float128 float64_to_float128( float64 a )
2398 {
2399 flag aSign;
2400 int16 aExp;
2401 bits64 aSig, zSig0, zSig1;
2402
2403 aSig = extractFloat64Frac( a );
2404 aExp = extractFloat64Exp( a );
2405 aSign = extractFloat64Sign( a );
2406 if ( aExp == 0x7FF ) {
2407 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a ) );
2408 return packFloat128( aSign, 0x7FFF, 0, 0 );
2409 }
2410 if ( aExp == 0 ) {
2411 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
2412 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2413 --aExp;
2414 }
2415 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
2416 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
2417
2418 }
2419
2420 #endif
2421
2422 /*----------------------------------------------------------------------------
2423 | Rounds the double-precision floating-point value `a' to an integer, and
2424 | returns the result as a double-precision floating-point value. The
2425 | operation is performed according to the IEC/IEEE Standard for Binary
2426 | Floating-Point Arithmetic.
2427 *----------------------------------------------------------------------------*/
2428
2429 float64 float64_round_to_int( float64 a )
2430 {
2431 flag aSign;
2432 int16 aExp;
2433 bits64 lastBitMask, roundBitsMask;
2434 int8 roundingMode;
2435 float64 z;
2436
2437 aExp = extractFloat64Exp( a );
2438 if ( 0x433 <= aExp ) {
2439 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
2440 return propagateFloat64NaN( a, a );
2441 }
2442 return a;
2443 }
2444 if ( aExp < 0x3FF ) {
2445 if ( (bits64) ( a<<1 ) == 0 ) return a;
2446 float_exception_flags |= float_flag_inexact;
2447 aSign = extractFloat64Sign( a );
2448 switch ( float_rounding_mode ) {
2449 case float_round_nearest_even:
2450 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
2451 return packFloat64( aSign, 0x3FF, 0 );
2452 }
2453 break;
2454 case float_round_down:
2455 return aSign ? LIT64( 0xBFF0000000000000 ) : 0;
2456 case float_round_up:
2457 return
2458 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 );
2459 }
2460 return packFloat64( aSign, 0, 0 );
2461 }
2462 lastBitMask = 1;
2463 lastBitMask <<= 0x433 - aExp;
2464 roundBitsMask = lastBitMask - 1;
2465 z = a;
2466 roundingMode = float_rounding_mode;
2467 if ( roundingMode == float_round_nearest_even ) {
2468 z += lastBitMask>>1;
2469 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
2470 }
2471 else if ( roundingMode != float_round_to_zero ) {
2472 if ( extractFloat64Sign( z ) ^ ( roundingMode == float_round_up ) ) {
2473 z += roundBitsMask;
2474 }
2475 }
2476 z &= ~ roundBitsMask;
2477 if ( z != a ) float_exception_flags |= float_flag_inexact;
2478 return z;
2479
2480 }
2481
2482 /*----------------------------------------------------------------------------
2483 | Returns the result of adding the absolute values of the double-precision
2484 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
2485 | before being returned. `zSign' is ignored if the result is a NaN.
2486 | The addition is performed according to the IEC/IEEE Standard for Binary
2487 | Floating-Point Arithmetic.
2488 *----------------------------------------------------------------------------*/
2489
2490 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign )
2491 {
2492 int16 aExp, bExp, zExp;
2493 bits64 aSig, bSig, zSig;
2494 int16 expDiff;
2495
2496 aSig = extractFloat64Frac( a );
2497 aExp = extractFloat64Exp( a );
2498 bSig = extractFloat64Frac( b );
2499 bExp = extractFloat64Exp( b );
2500 expDiff = aExp - bExp;
2501 aSig <<= 9;
2502 bSig <<= 9;
2503 if ( 0 < expDiff ) {
2504 if ( aExp == 0x7FF ) {
2505 if ( aSig ) return propagateFloat64NaN( a, b );
2506 return a;
2507 }
2508 if ( bExp == 0 ) {
2509 --expDiff;
2510 }
2511 else {
2512 bSig |= LIT64( 0x2000000000000000 );
2513 }
2514 shift64RightJamming( bSig, expDiff, &bSig );
2515 zExp = aExp;
2516 }
2517 else if ( expDiff < 0 ) {
2518 if ( bExp == 0x7FF ) {
2519 if ( bSig ) return propagateFloat64NaN( a, b );
2520 return packFloat64( zSign, 0x7FF, 0 );
2521 }
2522 if ( aExp == 0 ) {
2523 ++expDiff;
2524 }
2525 else {
2526 aSig |= LIT64( 0x2000000000000000 );
2527 }
2528 shift64RightJamming( aSig, - expDiff, &aSig );
2529 zExp = bExp;
2530 }
2531 else {
2532 if ( aExp == 0x7FF ) {
2533 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2534 return a;
2535 }
2536 if ( aExp == 0 ) return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
2537 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
2538 zExp = aExp;
2539 goto roundAndPack;
2540 }
2541 aSig |= LIT64( 0x2000000000000000 );
2542 zSig = ( aSig + bSig )<<1;
2543 --zExp;
2544 if ( (sbits64) zSig < 0 ) {
2545 zSig = aSig + bSig;
2546 ++zExp;
2547 }
2548 roundAndPack:
2549 return roundAndPackFloat64( zSign, zExp, zSig );
2550
2551 }
2552
2553 /*----------------------------------------------------------------------------
2554 | Returns the result of subtracting the absolute values of the double-
2555 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2556 | difference is negated before being returned. `zSign' is ignored if the
2557 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2558 | Standard for Binary Floating-Point Arithmetic.
2559 *----------------------------------------------------------------------------*/
2560
2561 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign )
2562 {
2563 int16 aExp, bExp, zExp;
2564 bits64 aSig, bSig, zSig;
2565 int16 expDiff;
2566
2567 aSig = extractFloat64Frac( a );
2568 aExp = extractFloat64Exp( a );
2569 bSig = extractFloat64Frac( b );
2570 bExp = extractFloat64Exp( b );
2571 expDiff = aExp - bExp;
2572 aSig <<= 10;
2573 bSig <<= 10;
2574 if ( 0 < expDiff ) goto aExpBigger;
2575 if ( expDiff < 0 ) goto bExpBigger;
2576 if ( aExp == 0x7FF ) {
2577 if ( aSig | bSig ) return propagateFloat64NaN( a, b );
2578 float_raise( float_flag_invalid );
2579 return float64_default_nan;
2580 }
2581 if ( aExp == 0 ) {
2582 aExp = 1;
2583 bExp = 1;
2584 }
2585 if ( bSig < aSig ) goto aBigger;
2586 if ( aSig < bSig ) goto bBigger;
2587 return packFloat64( float_rounding_mode == float_round_down, 0, 0 );
2588 bExpBigger:
2589 if ( bExp == 0x7FF ) {
2590 if ( bSig ) return propagateFloat64NaN( a, b );
2591 return packFloat64( zSign ^ 1, 0x7FF, 0 );
2592 }
2593 if ( aExp == 0 ) {
2594 ++expDiff;
2595 }
2596 else {
2597 aSig |= LIT64( 0x4000000000000000 );
2598 }
2599 shift64RightJamming( aSig, - expDiff, &aSig );
2600 bSig |= LIT64( 0x4000000000000000 );
2601 bBigger:
2602 zSig = bSig - aSig;
2603 zExp = bExp;
2604 zSign ^= 1;
2605 goto normalizeRoundAndPack;
2606 aExpBigger:
2607 if ( aExp == 0x7FF ) {
2608 if ( aSig ) return propagateFloat64NaN( a, b );
2609 return a;
2610 }
2611 if ( bExp == 0 ) {
2612 --expDiff;
2613 }
2614 else {
2615 bSig |= LIT64( 0x4000000000000000 );
2616 }
2617 shift64RightJamming( bSig, expDiff, &bSig );
2618 aSig |= LIT64( 0x4000000000000000 );
2619 aBigger:
2620 zSig = aSig - bSig;
2621 zExp = aExp;
2622 normalizeRoundAndPack:
2623 --zExp;
2624 return normalizeRoundAndPackFloat64( zSign, zExp, zSig );
2625
2626 }
2627
2628 /*----------------------------------------------------------------------------
2629 | Returns the result of adding the double-precision floating-point values `a'
2630 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2631 | Binary Floating-Point Arithmetic.
2632 *----------------------------------------------------------------------------*/
2633
2634 float64 float64_add( float64 a, float64 b )
2635 {
2636 flag aSign, bSign;
2637
2638 aSign = extractFloat64Sign( a );
2639 bSign = extractFloat64Sign( b );
2640 if ( aSign == bSign ) {
2641 return addFloat64Sigs( a, b, aSign );
2642 }
2643 else {
2644 return subFloat64Sigs( a, b, aSign );
2645 }
2646
2647 }
2648
2649 /*----------------------------------------------------------------------------
2650 | Returns the result of subtracting the double-precision floating-point values
2651 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2652 | for Binary Floating-Point Arithmetic.
2653 *----------------------------------------------------------------------------*/
2654
2655 float64 float64_sub( float64 a, float64 b )
2656 {
2657 flag aSign, bSign;
2658
2659 aSign = extractFloat64Sign( a );
2660 bSign = extractFloat64Sign( b );
2661 if ( aSign == bSign ) {
2662 return subFloat64Sigs( a, b, aSign );
2663 }
2664 else {
2665 return addFloat64Sigs( a, b, aSign );
2666 }
2667
2668 }
2669
2670 /*----------------------------------------------------------------------------
2671 | Returns the result of multiplying the double-precision floating-point values
2672 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2673 | for Binary Floating-Point Arithmetic.
2674 *----------------------------------------------------------------------------*/
2675
2676 float64 float64_mul( float64 a, float64 b )
2677 {
2678 flag aSign, bSign, zSign;
2679 int16 aExp, bExp, zExp;
2680 bits64 aSig, bSig, zSig0, zSig1;
2681
2682 aSig = extractFloat64Frac( a );
2683 aExp = extractFloat64Exp( a );
2684 aSign = extractFloat64Sign( a );
2685 bSig = extractFloat64Frac( b );
2686 bExp = extractFloat64Exp( b );
2687 bSign = extractFloat64Sign( b );
2688 zSign = aSign ^ bSign;
2689 if ( aExp == 0x7FF ) {
2690 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2691 return propagateFloat64NaN( a, b );
2692 }
2693 if ( ( bExp | bSig ) == 0 ) {
2694 float_raise( float_flag_invalid );
2695 return float64_default_nan;
2696 }
2697 return packFloat64( zSign, 0x7FF, 0 );
2698 }
2699 if ( bExp == 0x7FF ) {
2700 if ( bSig ) return propagateFloat64NaN( a, b );
2701 if ( ( aExp | aSig ) == 0 ) {
2702 float_raise( float_flag_invalid );
2703 return float64_default_nan;
2704 }
2705 return packFloat64( zSign, 0x7FF, 0 );
2706 }
2707 if ( aExp == 0 ) {
2708 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2709 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2710 }
2711 if ( bExp == 0 ) {
2712 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
2713 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2714 }
2715 zExp = aExp + bExp - 0x3FF;
2716 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2717 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2718 mul64To128( aSig, bSig, &zSig0, &zSig1 );
2719 zSig0 |= ( zSig1 != 0 );
2720 if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
2721 zSig0 <<= 1;
2722 --zExp;
2723 }
2724 return roundAndPackFloat64( zSign, zExp, zSig0 );
2725
2726 }
2727
2728 /*----------------------------------------------------------------------------
2729 | Returns the result of dividing the double-precision floating-point value `a'
2730 | by the corresponding value `b'. The operation is performed according to
2731 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2732 *----------------------------------------------------------------------------*/
2733
2734 float64 float64_div( float64 a, float64 b )
2735 {
2736 flag aSign, bSign, zSign;
2737 int16 aExp, bExp, zExp;
2738 bits64 aSig, bSig, zSig;
2739 bits64 rem0, rem1;
2740 bits64 term0, term1;
2741
2742 aSig = extractFloat64Frac( a );
2743 aExp = extractFloat64Exp( a );
2744 aSign = extractFloat64Sign( a );
2745 bSig = extractFloat64Frac( b );
2746 bExp = extractFloat64Exp( b );
2747 bSign = extractFloat64Sign( b );
2748 zSign = aSign ^ bSign;
2749 if ( aExp == 0x7FF ) {
2750 if ( aSig ) return propagateFloat64NaN( a, b );
2751 if ( bExp == 0x7FF ) {
2752 if ( bSig ) return propagateFloat64NaN( a, b );
2753 float_raise( float_flag_invalid );
2754 return float64_default_nan;
2755 }
2756 return packFloat64( zSign, 0x7FF, 0 );
2757 }
2758 if ( bExp == 0x7FF ) {
2759 if ( bSig ) return propagateFloat64NaN( a, b );
2760 return packFloat64( zSign, 0, 0 );
2761 }
2762 if ( bExp == 0 ) {
2763 if ( bSig == 0 ) {
2764 if ( ( aExp | aSig ) == 0 ) {
2765 float_raise( float_flag_invalid );
2766 return float64_default_nan;
2767 }
2768 float_raise( float_flag_divbyzero );
2769 return packFloat64( zSign, 0x7FF, 0 );
2770 }
2771 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2772 }
2773 if ( aExp == 0 ) {
2774 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
2775 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2776 }
2777 zExp = aExp - bExp + 0x3FD;
2778 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
2779 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2780 if ( bSig <= ( aSig + aSig ) ) {
2781 aSig >>= 1;
2782 ++zExp;
2783 }
2784 zSig = estimateDiv128To64( aSig, 0, bSig );
2785 if ( ( zSig & 0x1FF ) <= 2 ) {
2786 mul64To128( bSig, zSig, &term0, &term1 );
2787 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2788 while ( (sbits64) rem0 < 0 ) {
2789 --zSig;
2790 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
2791 }
2792 zSig |= ( rem1 != 0 );
2793 }
2794 return roundAndPackFloat64( zSign, zExp, zSig );
2795
2796 }
2797
2798 /*----------------------------------------------------------------------------
2799 | Returns the remainder of the double-precision floating-point value `a'
2800 | with respect to the corresponding value `b'. The operation is performed
2801 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2802 *----------------------------------------------------------------------------*/
2803
2804 float64 float64_rem( float64 a, float64 b )
2805 {
2806 flag aSign, bSign, zSign;
2807 int16 aExp, bExp, expDiff;
2808 bits64 aSig, bSig;
2809 bits64 q, alternateASig;
2810 sbits64 sigMean;
2811
2812 aSig = extractFloat64Frac( a );
2813 aExp = extractFloat64Exp( a );
2814 aSign = extractFloat64Sign( a );
2815 bSig = extractFloat64Frac( b );
2816 bExp = extractFloat64Exp( b );
2817 bSign = extractFloat64Sign( b );
2818 if ( aExp == 0x7FF ) {
2819 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
2820 return propagateFloat64NaN( a, b );
2821 }
2822 float_raise( float_flag_invalid );
2823 return float64_default_nan;
2824 }
2825 if ( bExp == 0x7FF ) {
2826 if ( bSig ) return propagateFloat64NaN( a, b );
2827 return a;
2828 }
2829 if ( bExp == 0 ) {
2830 if ( bSig == 0 ) {
2831 float_raise( float_flag_invalid );
2832 return float64_default_nan;
2833 }
2834 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
2835 }
2836 if ( aExp == 0 ) {
2837 if ( aSig == 0 ) return a;
2838 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2839 }
2840 expDiff = aExp - bExp;
2841 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
2842 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
2843 if ( expDiff < 0 ) {
2844 if ( expDiff < -1 ) return a;
2845 aSig >>= 1;
2846 }
2847 q = ( bSig <= aSig );
2848 if ( q ) aSig -= bSig;
2849 expDiff -= 64;
2850 while ( 0 < expDiff ) {
2851 q = estimateDiv128To64( aSig, 0, bSig );
2852 q = ( 2 < q ) ? q - 2 : 0;
2853 aSig = - ( ( bSig>>2 ) * q );
2854 expDiff -= 62;
2855 }
2856 expDiff += 64;
2857 if ( 0 < expDiff ) {
2858 q = estimateDiv128To64( aSig, 0, bSig );
2859 q = ( 2 < q ) ? q - 2 : 0;
2860 q >>= 64 - expDiff;
2861 bSig >>= 2;
2862 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2863 }
2864 else {
2865 aSig >>= 2;
2866 bSig >>= 2;
2867 }
2868 do {
2869 alternateASig = aSig;
2870 ++q;
2871 aSig -= bSig;
2872 } while ( 0 <= (sbits64) aSig );
2873 sigMean = aSig + alternateASig;
2874 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2875 aSig = alternateASig;
2876 }
2877 zSign = ( (sbits64) aSig < 0 );
2878 if ( zSign ) aSig = - aSig;
2879 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig );
2880
2881 }
2882
2883 /*----------------------------------------------------------------------------
2884 | Returns the square root of the double-precision floating-point value `a'.
2885 | The operation is performed according to the IEC/IEEE Standard for Binary
2886 | Floating-Point Arithmetic.
2887 *----------------------------------------------------------------------------*/
2888
2889 float64 float64_sqrt( float64 a )
2890 {
2891 flag aSign;
2892 int16 aExp, zExp;
2893 bits64 aSig, zSig, doubleZSig;
2894 bits64 rem0, rem1, term0, term1;
2895 float64 z;
2896
2897 aSig = extractFloat64Frac( a );
2898 aExp = extractFloat64Exp( a );
2899 aSign = extractFloat64Sign( a );
2900 if ( aExp == 0x7FF ) {
2901 if ( aSig ) return propagateFloat64NaN( a, a );
2902 if ( ! aSign ) return a;
2903 float_raise( float_flag_invalid );
2904 return float64_default_nan;
2905 }
2906 if ( aSign ) {
2907 if ( ( aExp | aSig ) == 0 ) return a;
2908 float_raise( float_flag_invalid );
2909 return float64_default_nan;
2910 }
2911 if ( aExp == 0 ) {
2912 if ( aSig == 0 ) return 0;
2913 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
2914 }
2915 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
2916 aSig |= LIT64( 0x0010000000000000 );
2917 zSig = estimateSqrt32( aExp, aSig>>21 );
2918 aSig <<= 9 - ( aExp & 1 );
2919 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
2920 if ( ( zSig & 0x1FF ) <= 5 ) {
2921 doubleZSig = zSig<<1;
2922 mul64To128( zSig, zSig, &term0, &term1 );
2923 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
2924 while ( (sbits64) rem0 < 0 ) {
2925 --zSig;
2926 doubleZSig -= 2;
2927 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
2928 }
2929 zSig |= ( ( rem0 | rem1 ) != 0 );
2930 }
2931 return roundAndPackFloat64( 0, zExp, zSig );
2932
2933 }
2934
2935 /*----------------------------------------------------------------------------
2936 | Returns 1 if the double-precision floating-point value `a' is equal to the
2937 | corresponding value `b', and 0 otherwise. The comparison is performed
2938 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2939 *----------------------------------------------------------------------------*/
2940
2941 flag float64_eq( float64 a, float64 b )
2942 {
2943
2944 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2945 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2946 ) {
2947 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
2948 float_raise( float_flag_invalid );
2949 }
2950 return 0;
2951 }
2952 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
2953
2954 }
2955
2956 /*----------------------------------------------------------------------------
2957 | Returns 1 if the double-precision floating-point value `a' is less than or
2958 | equal to the corresponding value `b', and 0 otherwise. The comparison is
2959 | performed according to the IEC/IEEE Standard for Binary Floating-Point
2960 | Arithmetic.
2961 *----------------------------------------------------------------------------*/
2962
2963 flag float64_le( float64 a, float64 b )
2964 {
2965 flag aSign, bSign;
2966
2967 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2968 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2969 ) {
2970 float_raise( float_flag_invalid );
2971 return 0;
2972 }
2973 aSign = extractFloat64Sign( a );
2974 bSign = extractFloat64Sign( b );
2975 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
2976 return ( a == b ) || ( aSign ^ ( a < b ) );
2977
2978 }
2979
2980 /*----------------------------------------------------------------------------
2981 | Returns 1 if the double-precision floating-point value `a' is less than
2982 | the corresponding value `b', and 0 otherwise. The comparison is performed
2983 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2984 *----------------------------------------------------------------------------*/
2985
2986 flag float64_lt( float64 a, float64 b )
2987 {
2988 flag aSign, bSign;
2989
2990 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
2991 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
2992 ) {
2993 float_raise( float_flag_invalid );
2994 return 0;
2995 }
2996 aSign = extractFloat64Sign( a );
2997 bSign = extractFloat64Sign( b );
2998 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
2999 return ( a != b ) && ( aSign ^ ( a < b ) );
3000
3001 }
3002
3003 /*----------------------------------------------------------------------------
3004 | Returns 1 if the double-precision floating-point value `a' is equal to the
3005 | corresponding value `b', and 0 otherwise. The invalid exception is raised
3006 | if either operand is a NaN. Otherwise, the comparison is performed
3007 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3008 *----------------------------------------------------------------------------*/
3009
3010 flag float64_eq_signaling( float64 a, float64 b )
3011 {
3012
3013 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3014 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3015 ) {
3016 float_raise( float_flag_invalid );
3017 return 0;
3018 }
3019 return ( a == b ) || ( (bits64) ( ( a | b )<<1 ) == 0 );
3020
3021 }
3022
3023 /*----------------------------------------------------------------------------
3024 | Returns 1 if the double-precision floating-point value `a' is less than or
3025 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
3026 | cause an exception. Otherwise, the comparison is performed according to the
3027 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3028 *----------------------------------------------------------------------------*/
3029
3030 flag float64_le_quiet( float64 a, float64 b )
3031 {
3032 flag aSign, bSign;
3033 int16 aExp, bExp;
3034
3035 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3036 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3037 ) {
3038 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3039 float_raise( float_flag_invalid );
3040 }
3041 return 0;
3042 }
3043 aSign = extractFloat64Sign( a );
3044 bSign = extractFloat64Sign( b );
3045 if ( aSign != bSign ) return aSign || ( (bits64) ( ( a | b )<<1 ) == 0 );
3046 return ( a == b ) || ( aSign ^ ( a < b ) );
3047
3048 }
3049
3050 /*----------------------------------------------------------------------------
3051 | Returns 1 if the double-precision floating-point value `a' is less than
3052 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3053 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3054 | Standard for Binary Floating-Point Arithmetic.
3055 *----------------------------------------------------------------------------*/
3056
3057 flag float64_lt_quiet( float64 a, float64 b )
3058 {
3059 flag aSign, bSign;
3060
3061 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
3062 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
3063 ) {
3064 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
3065 float_raise( float_flag_invalid );
3066 }
3067 return 0;
3068 }
3069 aSign = extractFloat64Sign( a );
3070 bSign = extractFloat64Sign( b );
3071 if ( aSign != bSign ) return aSign && ( (bits64) ( ( a | b )<<1 ) != 0 );
3072 return ( a != b ) && ( aSign ^ ( a < b ) );
3073
3074 }
3075
3076 #ifdef FLOATX80
3077
3078 /*----------------------------------------------------------------------------
3079 | Returns the result of converting the extended double-precision floating-
3080 | point value `a' to the 32-bit two's complement integer format. The
3081 | conversion is performed according to the IEC/IEEE Standard for Binary
3082 | Floating-Point Arithmetic---which means in particular that the conversion
3083 | is rounded according to the current rounding mode. If `a' is a NaN, the
3084 | largest positive integer is returned. Otherwise, if the conversion
3085 | overflows, the largest integer with the same sign as `a' is returned.
3086 *----------------------------------------------------------------------------*/
3087
3088 int32 floatx80_to_int32( floatx80 a )
3089 {
3090 flag aSign;
3091 int32 aExp, shiftCount;
3092 bits64 aSig;
3093
3094 aSig = extractFloatx80Frac( a );
3095 aExp = extractFloatx80Exp( a );
3096 aSign = extractFloatx80Sign( a );
3097 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3098 shiftCount = 0x4037 - aExp;
3099 if ( shiftCount <= 0 ) shiftCount = 1;
3100 shift64RightJamming( aSig, shiftCount, &aSig );
3101 return roundAndPackInt32( aSign, aSig );
3102
3103 }
3104
3105 /*----------------------------------------------------------------------------
3106 | Returns the result of converting the extended double-precision floating-
3107 | point value `a' to the 32-bit two's complement integer format. The
3108 | conversion is performed according to the IEC/IEEE Standard for Binary
3109 | Floating-Point Arithmetic, except that the conversion is always rounded
3110 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3111 | Otherwise, if the conversion overflows, the largest integer with the same
3112 | sign as `a' is returned.
3113 *----------------------------------------------------------------------------*/
3114
3115 int32 floatx80_to_int32_round_to_zero( floatx80 a )
3116 {
3117 flag aSign;
3118 int32 aExp, shiftCount;
3119 bits64 aSig, savedASig;
3120 int32 z;
3121
3122 aSig = extractFloatx80Frac( a );
3123 aExp = extractFloatx80Exp( a );
3124 aSign = extractFloatx80Sign( a );
3125 if ( 0x401E < aExp ) {
3126 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
3127 goto invalid;
3128 }
3129 else if ( aExp < 0x3FFF ) {
3130 if ( aExp || aSig ) float_exception_flags |= float_flag_inexact;
3131 return 0;
3132 }
3133 shiftCount = 0x403E - aExp;
3134 savedASig = aSig;
3135 aSig >>= shiftCount;
3136 z = aSig;
3137 if ( aSign ) z = - z;
3138 if ( ( z < 0 ) ^ aSign ) {
3139 invalid:
3140 float_raise( float_flag_invalid );
3141 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
3142 }
3143 if ( ( aSig<<shiftCount ) != savedASig ) {
3144 float_exception_flags |= float_flag_inexact;
3145 }
3146 return z;
3147
3148 }
3149
3150 /*----------------------------------------------------------------------------
3151 | Returns the result of converting the extended double-precision floating-
3152 | point value `a' to the 64-bit two's complement integer format. The
3153 | conversion is performed according to the IEC/IEEE Standard for Binary
3154 | Floating-Point Arithmetic---which means in particular that the conversion
3155 | is rounded according to the current rounding mode. If `a' is a NaN,
3156 | the largest positive integer is returned. Otherwise, if the conversion
3157 | overflows, the largest integer with the same sign as `a' is returned.
3158 *----------------------------------------------------------------------------*/
3159
3160 int64 floatx80_to_int64( floatx80 a )
3161 {
3162 flag aSign;
3163 int32 aExp, shiftCount;
3164 bits64 aSig, aSigExtra;
3165
3166 aSig = extractFloatx80Frac( a );
3167 aExp = extractFloatx80Exp( a );
3168 aSign = extractFloatx80Sign( a );
3169 shiftCount = 0x403E - aExp;
3170 if ( shiftCount <= 0 ) {
3171 if ( shiftCount ) {
3172 float_raise( float_flag_invalid );
3173 if ( ! aSign
3174 || ( ( aExp == 0x7FFF )
3175 && ( aSig != LIT64( 0x8000000000000000 ) ) )
3176 ) {
3177 return LIT64( 0x7FFFFFFFFFFFFFFF );
3178 }
3179 return (sbits64) LIT64( 0x8000000000000000 );
3180 }
3181 aSigExtra = 0;
3182 }
3183 else {
3184 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3185 }
3186 return roundAndPackInt64( aSign, aSig, aSigExtra );
3187
3188 }
3189
3190 /*----------------------------------------------------------------------------
3191 | Returns the result of converting the extended double-precision floating-
3192 | point value `a' to the 64-bit two's complement integer format. The
3193 | conversion is performed according to the IEC/IEEE Standard for Binary
3194 | Floating-Point Arithmetic, except that the conversion is always rounded
3195 | toward zero. If `a' is a NaN, the largest positive integer is returned.
3196 | Otherwise, if the conversion overflows, the largest integer with the same
3197 | sign as `a' is returned.
3198 *----------------------------------------------------------------------------*/
3199
3200 int64 floatx80_to_int64_round_to_zero( floatx80 a )
3201 {
3202 flag aSign;
3203 int32 aExp, shiftCount;
3204 bits64 aSig;
3205 int64 z;
3206
3207 aSig = extractFloatx80Frac( a );
3208 aExp = extractFloatx80Exp( a );
3209 aSign = extractFloatx80Sign( a );
3210 shiftCount = aExp - 0x403E;
3211 if ( 0 <= shiftCount ) {
3212 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
3213 if ( ( a.high != 0xC03E ) || aSig ) {
3214 float_raise( float_flag_invalid );
3215 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
3216 return LIT64( 0x7FFFFFFFFFFFFFFF );
3217 }
3218 }
3219 return (sbits64) LIT64( 0x8000000000000000 );
3220 }
3221 else if ( aExp < 0x3FFF ) {
3222 if ( aExp | aSig ) float_exception_flags |= float_flag_inexact;
3223 return 0;
3224 }
3225 z = aSig>>( - shiftCount );
3226 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
3227 float_exception_flags |= float_flag_inexact;
3228 }
3229 if ( aSign ) z = - z;
3230 return z;
3231
3232 }
3233
3234 /*----------------------------------------------------------------------------
3235 | Returns the result of converting the extended double-precision floating-
3236 | point value `a' to the single-precision floating-point format. The
3237 | conversion is performed according to the IEC/IEEE Standard for Binary
3238 | Floating-Point Arithmetic.
3239 *----------------------------------------------------------------------------*/
3240
3241 float32 floatx80_to_float32( floatx80 a )
3242 {
3243 flag aSign;
3244 int32 aExp;
3245 bits64 aSig;
3246
3247 aSig = extractFloatx80Frac( a );
3248 aExp = extractFloatx80Exp( a );
3249 aSign = extractFloatx80Sign( a );
3250 if ( aExp == 0x7FFF ) {
3251 if ( (bits64) ( aSig<<1 ) ) {
3252 return commonNaNToFloat32( floatx80ToCommonNaN( a ) );
3253 }
3254 return packFloat32( aSign, 0xFF, 0 );
3255 }
3256 shift64RightJamming( aSig, 33, &aSig );
3257 if ( aExp || aSig ) aExp -= 0x3F81;
3258 return roundAndPackFloat32( aSign, aExp, aSig );
3259
3260 }
3261
3262 /*----------------------------------------------------------------------------
3263 | Returns the result of converting the extended double-precision floating-
3264 | point value `a' to the double-precision floating-point format. The
3265 | conversion is performed according to the IEC/IEEE Standard for Binary
3266 | Floating-Point Arithmetic.
3267 *----------------------------------------------------------------------------*/
3268
3269 float64 floatx80_to_float64( floatx80 a )
3270 {
3271 flag aSign;
3272 int32 aExp;
3273 bits64 aSig, zSig;
3274
3275 aSig = extractFloatx80Frac( a );
3276 aExp = extractFloatx80Exp( a );
3277 aSign = extractFloatx80Sign( a );
3278 if ( aExp == 0x7FFF ) {
3279 if ( (bits64) ( aSig<<1 ) ) {
3280 return commonNaNToFloat64( floatx80ToCommonNaN( a ) );
3281 }
3282 return packFloat64( aSign, 0x7FF, 0 );
3283 }
3284 shift64RightJamming( aSig, 1, &zSig );
3285 if ( aExp || aSig ) aExp -= 0x3C01;
3286 return roundAndPackFloat64( aSign, aExp, zSig );
3287
3288 }
3289
3290 #ifdef FLOAT128
3291
3292 /*----------------------------------------------------------------------------
3293 | Returns the result of converting the extended double-precision floating-
3294 | point value `a' to the quadruple-precision floating-point format. The
3295 | conversion is performed according to the IEC/IEEE Standard for Binary
3296 | Floating-Point Arithmetic.
3297 *----------------------------------------------------------------------------*/
3298
3299 float128 floatx80_to_float128( floatx80 a )
3300 {
3301 flag aSign;
3302 int16 aExp;
3303 bits64 aSig, zSig0, zSig1;
3304
3305 aSig = extractFloatx80Frac( a );
3306 aExp = extractFloatx80Exp( a );
3307 aSign = extractFloatx80Sign( a );
3308 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
3309 return commonNaNToFloat128( floatx80ToCommonNaN( a ) );
3310 }
3311 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
3312 return packFloat128( aSign, aExp, zSig0, zSig1 );
3313
3314 }
3315
3316 #endif
3317
3318 /*----------------------------------------------------------------------------
3319 | Rounds the extended double-precision floating-point value `a' to an integer,
3320 | and returns the result as an extended quadruple-precision floating-point
3321 | value. The operation is performed according to the IEC/IEEE Standard for
3322 | Binary Floating-Point Arithmetic.
3323 *----------------------------------------------------------------------------*/
3324
3325 floatx80 floatx80_round_to_int( floatx80 a )
3326 {
3327 flag aSign;
3328 int32 aExp;
3329 bits64 lastBitMask, roundBitsMask;
3330 int8 roundingMode;
3331 floatx80 z;
3332
3333 aExp = extractFloatx80Exp( a );
3334 if ( 0x403E <= aExp ) {
3335 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
3336 return propagateFloatx80NaN( a, a );
3337 }
3338 return a;
3339 }
3340 if ( aExp < 0x3FFF ) {
3341 if ( ( aExp == 0 )
3342 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
3343 return a;
3344 }
3345 float_exception_flags |= float_flag_inexact;
3346 aSign = extractFloatx80Sign( a );
3347 switch ( float_rounding_mode ) {
3348 case float_round_nearest_even:
3349 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
3350 ) {
3351 return
3352 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
3353 }
3354 break;
3355 case float_round_down:
3356 return
3357 aSign ?
3358 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
3359 : packFloatx80( 0, 0, 0 );
3360 case float_round_up:
3361 return
3362 aSign ? packFloatx80( 1, 0, 0 )
3363 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
3364 }
3365 return packFloatx80( aSign, 0, 0 );
3366 }
3367 lastBitMask = 1;
3368 lastBitMask <<= 0x403E - aExp;
3369 roundBitsMask = lastBitMask - 1;
3370 z = a;
3371 roundingMode = float_rounding_mode;
3372 if ( roundingMode == float_round_nearest_even ) {
3373 z.low += lastBitMask>>1;
3374 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
3375 }
3376 else if ( roundingMode != float_round_to_zero ) {
3377 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
3378 z.low += roundBitsMask;
3379 }
3380 }
3381 z.low &= ~ roundBitsMask;
3382 if ( z.low == 0 ) {
3383 ++z.high;
3384 z.low = LIT64( 0x8000000000000000 );
3385 }
3386 if ( z.low != a.low ) float_exception_flags |= float_flag_inexact;
3387 return z;
3388
3389 }
3390
3391 /*----------------------------------------------------------------------------
3392 | Returns the result of adding the absolute values of the extended double-
3393 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
3394 | negated before being returned. `zSign' is ignored if the result is a NaN.
3395 | The addition is performed according to the IEC/IEEE Standard for Binary
3396 | Floating-Point Arithmetic.
3397 *----------------------------------------------------------------------------*/
3398
3399 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3400 {
3401 int32 aExp, bExp, zExp;
3402 bits64 aSig, bSig, zSig0, zSig1;
3403 int32 expDiff;
3404
3405 aSig = extractFloatx80Frac( a );
3406 aExp = extractFloatx80Exp( a );
3407 bSig = extractFloatx80Frac( b );
3408 bExp = extractFloatx80Exp( b );
3409 expDiff = aExp - bExp;
3410 if ( 0 < expDiff ) {
3411 if ( aExp == 0x7FFF ) {
3412 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3413 return a;
3414 }
3415 if ( bExp == 0 ) --expDiff;
3416 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3417 zExp = aExp;
3418 }
3419 else if ( expDiff < 0 ) {
3420 if ( bExp == 0x7FFF ) {
3421 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3422 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3423 }
3424 if ( aExp == 0 ) ++expDiff;
3425 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3426 zExp = bExp;
3427 }
3428 else {
3429 if ( aExp == 0x7FFF ) {
3430 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3431 return propagateFloatx80NaN( a, b );
3432 }
3433 return a;
3434 }
3435 zSig1 = 0;
3436 zSig0 = aSig + bSig;
3437 if ( aExp == 0 ) {
3438 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
3439 goto roundAndPack;
3440 }
3441 zExp = aExp;
3442 goto shiftRight1;
3443 }
3444 zSig0 = aSig + bSig;
3445 if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
3446 shiftRight1:
3447 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
3448 zSig0 |= LIT64( 0x8000000000000000 );
3449 ++zExp;
3450 roundAndPack:
3451 return
3452 roundAndPackFloatx80(
3453 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3454
3455 }
3456
3457 /*----------------------------------------------------------------------------
3458 | Returns the result of subtracting the absolute values of the extended
3459 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
3460 | difference is negated before being returned. `zSign' is ignored if the
3461 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3462 | Standard for Binary Floating-Point Arithmetic.
3463 *----------------------------------------------------------------------------*/
3464
3465 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign )
3466 {
3467 int32 aExp, bExp, zExp;
3468 bits64 aSig, bSig, zSig0, zSig1;
3469 int32 expDiff;
3470 floatx80 z;
3471
3472 aSig = extractFloatx80Frac( a );
3473 aExp = extractFloatx80Exp( a );
3474 bSig = extractFloatx80Frac( b );
3475 bExp = extractFloatx80Exp( b );
3476 expDiff = aExp - bExp;
3477 if ( 0 < expDiff ) goto aExpBigger;
3478 if ( expDiff < 0 ) goto bExpBigger;
3479 if ( aExp == 0x7FFF ) {
3480 if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
3481 return propagateFloatx80NaN( a, b );
3482 }
3483 float_raise( float_flag_invalid );
3484 z.low = floatx80_default_nan_low;
3485 z.high = floatx80_default_nan_high;
3486 return z;
3487 }
3488 if ( aExp == 0 ) {
3489 aExp = 1;
3490 bExp = 1;
3491 }
3492 zSig1 = 0;
3493 if ( bSig < aSig ) goto aBigger;
3494 if ( aSig < bSig ) goto bBigger;
3495 return packFloatx80( float_rounding_mode == float_round_down, 0, 0 );
3496 bExpBigger:
3497 if ( bExp == 0x7FFF ) {
3498 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3499 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
3500 }
3501 if ( aExp == 0 ) ++expDiff;
3502 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
3503 bBigger:
3504 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
3505 zExp = bExp;
3506 zSign ^= 1;
3507 goto normalizeRoundAndPack;
3508 aExpBigger:
3509 if ( aExp == 0x7FFF ) {
3510 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3511 return a;
3512 }
3513 if ( bExp == 0 ) --expDiff;
3514 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
3515 aBigger:
3516 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
3517 zExp = aExp;
3518 normalizeRoundAndPack:
3519 return
3520 normalizeRoundAndPackFloatx80(
3521 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3522
3523 }
3524
3525 /*----------------------------------------------------------------------------
3526 | Returns the result of adding the extended double-precision floating-point
3527 | values `a' and `b'. The operation is performed according to the IEC/IEEE
3528 | Standard for Binary Floating-Point Arithmetic.
3529 *----------------------------------------------------------------------------*/
3530
3531 floatx80 floatx80_add( floatx80 a, floatx80 b )
3532 {
3533 flag aSign, bSign;
3534
3535 aSign = extractFloatx80Sign( a );
3536 bSign = extractFloatx80Sign( b );
3537 if ( aSign == bSign ) {
3538 return addFloatx80Sigs( a, b, aSign );
3539 }
3540 else {
3541 return subFloatx80Sigs( a, b, aSign );
3542 }
3543
3544 }
3545
3546 /*----------------------------------------------------------------------------
3547 | Returns the result of subtracting the extended double-precision floating-
3548 | point values `a' and `b'. The operation is performed according to the
3549 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3550 *----------------------------------------------------------------------------*/
3551
3552 floatx80 floatx80_sub( floatx80 a, floatx80 b )
3553 {
3554 flag aSign, bSign;
3555
3556 aSign = extractFloatx80Sign( a );
3557 bSign = extractFloatx80Sign( b );
3558 if ( aSign == bSign ) {
3559 return subFloatx80Sigs( a, b, aSign );
3560 }
3561 else {
3562 return addFloatx80Sigs( a, b, aSign );
3563 }
3564
3565 }
3566
3567 /*----------------------------------------------------------------------------
3568 | Returns the result of multiplying the extended double-precision floating-
3569 | point values `a' and `b'. The operation is performed according to the
3570 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3571 *----------------------------------------------------------------------------*/
3572
3573 floatx80 floatx80_mul( floatx80 a, floatx80 b )
3574 {
3575 flag aSign, bSign, zSign;
3576 int32 aExp, bExp, zExp;
3577 bits64 aSig, bSig, zSig0, zSig1;
3578 floatx80 z;
3579
3580 aSig = extractFloatx80Frac( a );
3581 aExp = extractFloatx80Exp( a );
3582 aSign = extractFloatx80Sign( a );
3583 bSig = extractFloatx80Frac( b );
3584 bExp = extractFloatx80Exp( b );
3585 bSign = extractFloatx80Sign( b );
3586 zSign = aSign ^ bSign;
3587 if ( aExp == 0x7FFF ) {
3588 if ( (bits64) ( aSig<<1 )
3589 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3590 return propagateFloatx80NaN( a, b );
3591 }
3592 if ( ( bExp | bSig ) == 0 ) goto invalid;
3593 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3594 }
3595 if ( bExp == 0x7FFF ) {
3596 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3597 if ( ( aExp | aSig ) == 0 ) {
3598 invalid:
3599 float_raise( float_flag_invalid );
3600 z.low = floatx80_default_nan_low;
3601 z.high = floatx80_default_nan_high;
3602 return z;
3603 }
3604 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3605 }
3606 if ( aExp == 0 ) {
3607 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3608 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3609 }
3610 if ( bExp == 0 ) {
3611 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
3612 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3613 }
3614 zExp = aExp + bExp - 0x3FFE;
3615 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3616 if ( 0 < (sbits64) zSig0 ) {
3617 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
3618 --zExp;
3619 }
3620 return
3621 roundAndPackFloatx80(
3622 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3623
3624 }
3625
3626 /*----------------------------------------------------------------------------
3627 | Returns the result of dividing the extended double-precision floating-point
3628 | value `a' by the corresponding value `b'. The operation is performed
3629 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3630 *----------------------------------------------------------------------------*/
3631
3632 floatx80 floatx80_div( floatx80 a, floatx80 b )
3633 {
3634 flag aSign, bSign, zSign;
3635 int32 aExp, bExp, zExp;
3636 bits64 aSig, bSig, zSig0, zSig1;
3637 bits64 rem0, rem1, rem2, term0, term1, term2;
3638 floatx80 z;
3639
3640 aSig = extractFloatx80Frac( a );
3641 aExp = extractFloatx80Exp( a );
3642 aSign = extractFloatx80Sign( a );
3643 bSig = extractFloatx80Frac( b );
3644 bExp = extractFloatx80Exp( b );
3645 bSign = extractFloatx80Sign( b );
3646 zSign = aSign ^ bSign;
3647 if ( aExp == 0x7FFF ) {
3648 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b );
3649 if ( bExp == 0x7FFF ) {
3650 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3651 goto invalid;
3652 }
3653 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3654 }
3655 if ( bExp == 0x7FFF ) {
3656 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3657 return packFloatx80( zSign, 0, 0 );
3658 }
3659 if ( bExp == 0 ) {
3660 if ( bSig == 0 ) {
3661 if ( ( aExp | aSig ) == 0 ) {
3662 invalid:
3663 float_raise( float_flag_invalid );
3664 z.low = floatx80_default_nan_low;
3665 z.high = floatx80_default_nan_high;
3666 return z;
3667 }
3668 float_raise( float_flag_divbyzero );
3669 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3670 }
3671 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3672 }
3673 if ( aExp == 0 ) {
3674 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
3675 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
3676 }
3677 zExp = aExp - bExp + 0x3FFE;
3678 rem1 = 0;
3679 if ( bSig <= aSig ) {
3680 shift128Right( aSig, 0, 1, &aSig, &rem1 );
3681 ++zExp;
3682 }
3683 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
3684 mul64To128( bSig, zSig0, &term0, &term1 );
3685 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
3686 while ( (sbits64) rem0 < 0 ) {
3687 --zSig0;
3688 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
3689 }
3690 zSig1 = estimateDiv128To64( rem1, 0, bSig );
3691 if ( (bits64) ( zSig1<<1 ) <= 8 ) {
3692 mul64To128( bSig, zSig1, &term1, &term2 );
3693 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3694 while ( (sbits64) rem1 < 0 ) {
3695 --zSig1;
3696 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
3697 }
3698 zSig1 |= ( ( rem1 | rem2 ) != 0 );
3699 }
3700 return
3701 roundAndPackFloatx80(
3702 floatx80_rounding_precision, zSign, zExp, zSig0, zSig1 );
3703
3704 }
3705
3706 /*----------------------------------------------------------------------------
3707 | Returns the remainder of the extended double-precision floating-point value
3708 | `a' with respect to the corresponding value `b'. The operation is performed
3709 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3710 *----------------------------------------------------------------------------*/
3711
3712 floatx80 floatx80_rem( floatx80 a, floatx80 b )
3713 {
3714 flag aSign, bSign, zSign;
3715 int32 aExp, bExp, expDiff;
3716 bits64 aSig0, aSig1, bSig;
3717 bits64 q, term0, term1, alternateASig0, alternateASig1;
3718 floatx80 z;
3719
3720 aSig0 = extractFloatx80Frac( a );
3721 aExp = extractFloatx80Exp( a );
3722 aSign = extractFloatx80Sign( a );
3723 bSig = extractFloatx80Frac( b );
3724 bExp = extractFloatx80Exp( b );
3725 bSign = extractFloatx80Sign( b );
3726 if ( aExp == 0x7FFF ) {
3727 if ( (bits64) ( aSig0<<1 )
3728 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
3729 return propagateFloatx80NaN( a, b );
3730 }
3731 goto invalid;
3732 }
3733 if ( bExp == 0x7FFF ) {
3734 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b );
3735 return a;
3736 }
3737 if ( bExp == 0 ) {
3738 if ( bSig == 0 ) {
3739 invalid:
3740 float_raise( float_flag_invalid );
3741 z.low = floatx80_default_nan_low;
3742 z.high = floatx80_default_nan_high;
3743 return z;
3744 }
3745 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
3746 }
3747 if ( aExp == 0 ) {
3748 if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
3749 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3750 }
3751 bSig |= LIT64( 0x8000000000000000 );
3752 zSign = aSign;
3753 expDiff = aExp - bExp;
3754 aSig1 = 0;
3755 if ( expDiff < 0 ) {
3756 if ( expDiff < -1 ) return a;
3757 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
3758 expDiff = 0;
3759 }
3760 q = ( bSig <= aSig0 );
3761 if ( q ) aSig0 -= bSig;
3762 expDiff -= 64;
3763 while ( 0 < expDiff ) {
3764 q = estimateDiv128To64( aSig0, aSig1, bSig );
3765 q = ( 2 < q ) ? q - 2 : 0;
3766 mul64To128( bSig, q, &term0, &term1 );
3767 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3768 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
3769 expDiff -= 62;
3770 }
3771 expDiff += 64;
3772 if ( 0 < expDiff ) {
3773 q = estimateDiv128To64( aSig0, aSig1, bSig );
3774 q = ( 2 < q ) ? q - 2 : 0;
3775 q >>= 64 - expDiff;
3776 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
3777 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3778 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
3779 while ( le128( term0, term1, aSig0, aSig1 ) ) {
3780 ++q;
3781 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
3782 }
3783 }
3784 else {
3785 term1 = 0;
3786 term0 = bSig;
3787 }
3788 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
3789 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
3790 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
3791 && ( q & 1 ) )
3792 ) {
3793 aSig0 = alternateASig0;
3794 aSig1 = alternateASig1;
3795 zSign = ! zSign;
3796 }
3797 return
3798 normalizeRoundAndPackFloatx80(
3799 80, zSign, bExp + expDiff, aSig0, aSig1 );
3800
3801 }
3802
3803 /*----------------------------------------------------------------------------
3804 | Returns the square root of the extended double-precision floating-point
3805 | value `a'. The operation is performed according to the IEC/IEEE Standard
3806 | for Binary Floating-Point Arithmetic.
3807 *----------------------------------------------------------------------------*/
3808
3809 floatx80 floatx80_sqrt( floatx80 a )
3810 {
3811 flag aSign;
3812 int32 aExp, zExp;
3813 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
3814 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
3815 floatx80 z;
3816
3817 aSig0 = extractFloatx80Frac( a );
3818 aExp = extractFloatx80Exp( a );
3819 aSign = extractFloatx80Sign( a );
3820 if ( aExp == 0x7FFF ) {
3821 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a );
3822 if ( ! aSign ) return a;
3823 goto invalid;
3824 }
3825 if ( aSign ) {
3826 if ( ( aExp | aSig0 ) == 0 ) return a;
3827 invalid:
3828 float_raise( float_flag_invalid );
3829 z.low = floatx80_default_nan_low;
3830 z.high = floatx80_default_nan_high;
3831 return z;
3832 }
3833 if ( aExp == 0 ) {
3834 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
3835 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
3836 }
3837 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
3838 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
3839 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
3840 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
3841 doubleZSig0 = zSig0<<1;
3842 mul64To128( zSig0, zSig0, &term0, &term1 );
3843 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
3844 while ( (sbits64) rem0 < 0 ) {
3845 --zSig0;
3846 doubleZSig0 -= 2;
3847 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
3848 }
3849 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
3850 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
3851 if ( zSig1 == 0 ) zSig1 = 1;
3852 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
3853 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
3854 mul64To128( zSig1, zSig1, &term2, &term3 );
3855 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
3856 while ( (sbits64) rem1 < 0 ) {
3857 --zSig1;
3858 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
3859 term3 |= 1;
3860 term2 |= doubleZSig0;
3861 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
3862 }
3863 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
3864 }
3865 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
3866 zSig0 |= doubleZSig0;
3867 return
3868 roundAndPackFloatx80(
3869 floatx80_rounding_precision, 0, zExp, zSig0, zSig1 );
3870
3871 }
3872
3873 /*----------------------------------------------------------------------------
3874 | Returns 1 if the extended double-precision floating-point value `a' is
3875 | equal to the corresponding value `b', and 0 otherwise. The comparison is
3876 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3877 | Arithmetic.
3878 *----------------------------------------------------------------------------*/
3879
3880 flag floatx80_eq( floatx80 a, floatx80 b )
3881 {
3882
3883 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3884 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3885 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3886 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3887 ) {
3888 if ( floatx80_is_signaling_nan( a )
3889 || floatx80_is_signaling_nan( b ) ) {
3890 float_raise( float_flag_invalid );
3891 }
3892 return 0;
3893 }
3894 return
3895 ( a.low == b.low )
3896 && ( ( a.high == b.high )
3897 || ( ( a.low == 0 )
3898 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3899 );
3900
3901 }
3902
3903 /*----------------------------------------------------------------------------
3904 | Returns 1 if the extended double-precision floating-point value `a' is
3905 | less than or equal to the corresponding value `b', and 0 otherwise. The
3906 | comparison is performed according to the IEC/IEEE Standard for Binary
3907 | Floating-Point Arithmetic.
3908 *----------------------------------------------------------------------------*/
3909
3910 flag floatx80_le( floatx80 a, floatx80 b )
3911 {
3912 flag aSign, bSign;
3913
3914 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3915 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3916 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3917 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3918 ) {
3919 float_raise( float_flag_invalid );
3920 return 0;
3921 }
3922 aSign = extractFloatx80Sign( a );
3923 bSign = extractFloatx80Sign( b );
3924 if ( aSign != bSign ) {
3925 return
3926 aSign
3927 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3928 == 0 );
3929 }
3930 return
3931 aSign ? le128( b.high, b.low, a.high, a.low )
3932 : le128( a.high, a.low, b.high, b.low );
3933
3934 }
3935
3936 /*----------------------------------------------------------------------------
3937 | Returns 1 if the extended double-precision floating-point value `a' is
3938 | less than the corresponding value `b', and 0 otherwise. The comparison
3939 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3940 | Arithmetic.
3941 *----------------------------------------------------------------------------*/
3942
3943 flag floatx80_lt( floatx80 a, floatx80 b )
3944 {
3945 flag aSign, bSign;
3946
3947 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3948 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3949 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3950 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3951 ) {
3952 float_raise( float_flag_invalid );
3953 return 0;
3954 }
3955 aSign = extractFloatx80Sign( a );
3956 bSign = extractFloatx80Sign( b );
3957 if ( aSign != bSign ) {
3958 return
3959 aSign
3960 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
3961 != 0 );
3962 }
3963 return
3964 aSign ? lt128( b.high, b.low, a.high, a.low )
3965 : lt128( a.high, a.low, b.high, b.low );
3966
3967 }
3968
3969 /*----------------------------------------------------------------------------
3970 | Returns 1 if the extended double-precision floating-point value `a' is equal
3971 | to the corresponding value `b', and 0 otherwise. The invalid exception is
3972 | raised if either operand is a NaN. Otherwise, the comparison is performed
3973 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3974 *----------------------------------------------------------------------------*/
3975
3976 flag floatx80_eq_signaling( floatx80 a, floatx80 b )
3977 {
3978
3979 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
3980 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
3981 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
3982 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
3983 ) {
3984 float_raise( float_flag_invalid );
3985 return 0;
3986 }
3987 return
3988 ( a.low == b.low )
3989 && ( ( a.high == b.high )
3990 || ( ( a.low == 0 )
3991 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
3992 );
3993
3994 }
3995
3996 /*----------------------------------------------------------------------------
3997 | Returns 1 if the extended double-precision floating-point value `a' is less
3998 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
3999 | do not cause an exception. Otherwise, the comparison is performed according
4000 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4001 *----------------------------------------------------------------------------*/
4002
4003 flag floatx80_le_quiet( floatx80 a, floatx80 b )
4004 {
4005 flag aSign, bSign;
4006
4007 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4008 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4009 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4010 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4011 ) {
4012 if ( floatx80_is_signaling_nan( a )
4013 || floatx80_is_signaling_nan( b ) ) {
4014 float_raise( float_flag_invalid );
4015 }
4016 return 0;
4017 }
4018 aSign = extractFloatx80Sign( a );
4019 bSign = extractFloatx80Sign( b );
4020 if ( aSign != bSign ) {
4021 return
4022 aSign
4023 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4024 == 0 );
4025 }
4026 return
4027 aSign ? le128( b.high, b.low, a.high, a.low )
4028 : le128( a.high, a.low, b.high, b.low );
4029
4030 }
4031
4032 /*----------------------------------------------------------------------------
4033 | Returns 1 if the extended double-precision floating-point value `a' is less
4034 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
4035 | an exception. Otherwise, the comparison is performed according to the
4036 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4037 *----------------------------------------------------------------------------*/
4038
4039 flag floatx80_lt_quiet( floatx80 a, floatx80 b )
4040 {
4041 flag aSign, bSign;
4042
4043 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
4044 && (bits64) ( extractFloatx80Frac( a )<<1 ) )
4045 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
4046 && (bits64) ( extractFloatx80Frac( b )<<1 ) )
4047 ) {
4048 if ( floatx80_is_signaling_nan( a )
4049 || floatx80_is_signaling_nan( b ) ) {
4050 float_raise( float_flag_invalid );
4051 }
4052 return 0;
4053 }
4054 aSign = extractFloatx80Sign( a );
4055 bSign = extractFloatx80Sign( b );
4056 if ( aSign != bSign ) {
4057 return
4058 aSign
4059 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
4060 != 0 );
4061 }
4062 return
4063 aSign ? lt128( b.high, b.low, a.high, a.low )
4064 : lt128( a.high, a.low, b.high, b.low );
4065
4066 }
4067
4068 #endif
4069
4070 #ifdef FLOAT128
4071
4072 /*----------------------------------------------------------------------------
4073 | Returns the result of converting the quadruple-precision floating-point
4074 | value `a' to the 32-bit two's complement integer format. The conversion
4075 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4076 | Arithmetic---which means in particular that the conversion is rounded
4077 | according to the current rounding mode. If `a' is a NaN, the largest
4078 | positive integer is returned. Otherwise, if the conversion overflows, the
4079 | largest integer with the same sign as `a' is returned.
4080 *----------------------------------------------------------------------------*/
4081
4082 int32 float128_to_int32( float128 a )
4083 {
4084 flag aSign;
4085 int32 aExp, shiftCount;
4086 bits64 aSig0, aSig1;
4087
4088 aSig1 = extractFloat128Frac1( a );
4089 aSig0 = extractFloat128Frac0( a );
4090 aExp = extractFloat128Exp( a );
4091 aSign = extractFloat128Sign( a );
4092 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
4093 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4094 aSig0 |= ( aSig1 != 0 );
4095 shiftCount = 0x4028 - aExp;
4096 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
4097 return roundAndPackInt32( aSign, aSig0 );
4098
4099 }
4100
4101 /*----------------------------------------------------------------------------
4102 | Returns the result of converting the quadruple-precision floating-point
4103 | value `a' to the 32-bit two's complement integer format. The conversion
4104 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4105 | Arithmetic, except that the conversion is always rounded toward zero. If
4106 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
4107 | conversion overflows, the largest integer with the same sign as `a' is
4108 | returned.
4109 *----------------------------------------------------------------------------*/
4110
4111 int32 float128_to_int32_round_to_zero( float128 a )
4112 {
4113 flag aSign;
4114 int32 aExp, shiftCount;
4115 bits64 aSig0, aSig1, savedASig;
4116 int32 z;
4117
4118 aSig1 = extractFloat128Frac1( a );
4119 aSig0 = extractFloat128Frac0( a );
4120 aExp = extractFloat128Exp( a );
4121 aSign = extractFloat128Sign( a );
4122 aSig0 |= ( aSig1 != 0 );
4123 if ( 0x401E < aExp ) {
4124 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
4125 goto invalid;
4126 }
4127 else if ( aExp < 0x3FFF ) {
4128 if ( aExp || aSig0 ) float_exception_flags |= float_flag_inexact;
4129 return 0;
4130 }
4131 aSig0 |= LIT64( 0x0001000000000000 );
4132 shiftCount = 0x402F - aExp;
4133 savedASig = aSig0;
4134 aSig0 >>= shiftCount;
4135 z = aSig0;
4136 if ( aSign ) z = - z;
4137 if ( ( z < 0 ) ^ aSign ) {
4138 invalid:
4139 float_raise( float_flag_invalid );
4140 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
4141 }
4142 if ( ( aSig0<<shiftCount ) != savedASig ) {
4143 float_exception_flags |= float_flag_inexact;
4144 }
4145 return z;
4146
4147 }
4148
4149 /*----------------------------------------------------------------------------
4150 | Returns the result of converting the quadruple-precision floating-point
4151 | value `a' to the 64-bit two's complement integer format. The conversion
4152 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4153 | Arithmetic---which means in particular that the conversion is rounded
4154 | according to the current rounding mode. If `a' is a NaN, the largest
4155 | positive integer is returned. Otherwise, if the conversion overflows, the
4156 | largest integer with the same sign as `a' is returned.
4157 *----------------------------------------------------------------------------*/
4158
4159 int64 float128_to_int64( float128 a )
4160 {
4161 flag aSign;
4162 int32 aExp, shiftCount;
4163 bits64 aSig0, aSig1;
4164
4165 aSig1 = extractFloat128Frac1( a );
4166 aSig0 = extractFloat128Frac0( a );
4167 aExp = extractFloat128Exp( a );
4168 aSign = extractFloat128Sign( a );
4169 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4170 shiftCount = 0x402F - aExp;
4171 if ( shiftCount <= 0 ) {
4172 if ( 0x403E < aExp ) {
4173 float_raise( float_flag_invalid );
4174 if ( ! aSign
4175 || ( ( aExp == 0x7FFF )
4176 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
4177 )
4178 ) {
4179 return LIT64( 0x7FFFFFFFFFFFFFFF );
4180 }
4181 return (sbits64) LIT64( 0x8000000000000000 );
4182 }
4183 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
4184 }
4185 else {
4186 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
4187 }
4188 return roundAndPackInt64( aSign, aSig0, aSig1 );
4189
4190 }
4191
4192 /*----------------------------------------------------------------------------
4193 | Returns the result of converting the quadruple-precision floating-point
4194 | value `a' to the 64-bit two's complement integer format. The conversion
4195 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4196 | Arithmetic, except that the conversion is always rounded toward zero.
4197 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
4198 | the conversion overflows, the largest integer with the same sign as `a' is
4199 | returned.
4200 *----------------------------------------------------------------------------*/
4201
4202 int64 float128_to_int64_round_to_zero( float128 a )
4203 {
4204 flag aSign;
4205 int32 aExp, shiftCount;
4206 bits64 aSig0, aSig1;
4207 int64 z;
4208
4209 aSig1 = extractFloat128Frac1( a );
4210 aSig0 = extractFloat128Frac0( a );
4211 aExp = extractFloat128Exp( a );
4212 aSign = extractFloat128Sign( a );
4213 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
4214 shiftCount = aExp - 0x402F;
4215 if ( 0 < shiftCount ) {
4216 if ( 0x403E <= aExp ) {
4217 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
4218 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
4219 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
4220 if ( aSig1 ) float_exception_flags |= float_flag_inexact;
4221 }
4222 else {
4223 float_raise( float_flag_invalid );
4224 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
4225 return LIT64( 0x7FFFFFFFFFFFFFFF );
4226 }
4227 }
4228 return (sbits64) LIT64( 0x8000000000000000 );
4229 }
4230 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
4231 if ( (bits64) ( aSig1<<shiftCount ) ) {
4232 float_exception_flags |= float_flag_inexact;
4233 }
4234 }
4235 else {
4236 if ( aExp < 0x3FFF ) {
4237 if ( aExp | aSig0 | aSig1 ) {
4238 float_exception_flags |= float_flag_inexact;
4239 }
4240 return 0;
4241 }
4242 z = aSig0>>( - shiftCount );
4243 if ( aSig1
4244 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
4245 float_exception_flags |= float_flag_inexact;
4246 }
4247 }
4248 if ( aSign ) z = - z;
4249 return z;
4250
4251 }
4252
4253 /*----------------------------------------------------------------------------
4254 | Returns the result of converting the quadruple-precision floating-point
4255 | value `a' to the single-precision floating-point format. The conversion
4256 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4257 | Arithmetic.
4258 *----------------------------------------------------------------------------*/
4259
4260 float32 float128_to_float32( float128 a )
4261 {
4262 flag aSign;
4263 int32 aExp;
4264 bits64 aSig0, aSig1;
4265 bits32 zSig;
4266
4267 aSig1 = extractFloat128Frac1( a );
4268 aSig0 = extractFloat128Frac0( a );
4269 aExp = extractFloat128Exp( a );
4270 aSign = extractFloat128Sign( a );
4271 if ( aExp == 0x7FFF ) {
4272 if ( aSig0 | aSig1 ) {
4273 return commonNaNToFloat32( float128ToCommonNaN( a ) );
4274 }
4275 return packFloat32( aSign, 0xFF, 0 );
4276 }
4277 aSig0 |= ( aSig1 != 0 );
4278 shift64RightJamming( aSig0, 18, &aSig0 );
4279 zSig = aSig0;
4280 if ( aExp || zSig ) {
4281 zSig |= 0x40000000;
4282 aExp -= 0x3F81;
4283 }
4284 return roundAndPackFloat32( aSign, aExp, zSig );
4285
4286 }
4287
4288 /*----------------------------------------------------------------------------
4289 | Returns the result of converting the quadruple-precision floating-point
4290 | value `a' to the double-precision floating-point format. The conversion
4291 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
4292 | Arithmetic.
4293 *----------------------------------------------------------------------------*/
4294
4295 float64 float128_to_float64( float128 a )
4296 {
4297 flag aSign;
4298 int32 aExp;
4299 bits64 aSig0, aSig1;
4300
4301 aSig1 = extractFloat128Frac1( a );
4302 aSig0 = extractFloat128Frac0( a );
4303 aExp = extractFloat128Exp( a );
4304 aSign = extractFloat128Sign( a );
4305 if ( aExp == 0x7FFF ) {
4306 if ( aSig0 | aSig1 ) {
4307 return commonNaNToFloat64( float128ToCommonNaN( a ) );
4308 }
4309 return packFloat64( aSign, 0x7FF, 0 );
4310 }
4311 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4312 aSig0 |= ( aSig1 != 0 );
4313 if ( aExp || aSig0 ) {
4314 aSig0 |= LIT64( 0x4000000000000000 );
4315 aExp -= 0x3C01;
4316 }
4317 return roundAndPackFloat64( aSign, aExp, aSig0 );
4318
4319 }
4320
4321 #ifdef FLOATX80
4322
4323 /*----------------------------------------------------------------------------
4324 | Returns the result of converting the quadruple-precision floating-point
4325 | value `a' to the extended double-precision floating-point format. The
4326 | conversion is performed according to the IEC/IEEE Standard for Binary
4327 | Floating-Point Arithmetic.
4328 *----------------------------------------------------------------------------*/
4329
4330 floatx80 float128_to_floatx80( float128 a )
4331 {
4332 flag aSign;
4333 int32 aExp;
4334 bits64 aSig0, aSig1;
4335
4336 aSig1 = extractFloat128Frac1( a );
4337 aSig0 = extractFloat128Frac0( a );
4338 aExp = extractFloat128Exp( a );
4339 aSign = extractFloat128Sign( a );
4340 if ( aExp == 0x7FFF ) {
4341 if ( aSig0 | aSig1 ) {
4342 return commonNaNToFloatx80( float128ToCommonNaN( a ) );
4343 }
4344 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
4345 }
4346 if ( aExp == 0 ) {
4347 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
4348 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4349 }
4350 else {
4351 aSig0 |= LIT64( 0x0001000000000000 );
4352 }
4353 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
4354 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 );
4355
4356 }
4357
4358 #endif
4359
4360 /*----------------------------------------------------------------------------
4361 | Rounds the quadruple-precision floating-point value `a' to an integer, and
4362 | returns the result as a quadruple-precision floating-point value. The
4363 | operation is performed according to the IEC/IEEE Standard for Binary
4364 | Floating-Point Arithmetic.
4365 *----------------------------------------------------------------------------*/
4366
4367 float128 float128_round_to_int( float128 a )
4368 {
4369 flag aSign;
4370 int32 aExp;
4371 bits64 lastBitMask, roundBitsMask;
4372 int8 roundingMode;
4373 float128 z;
4374
4375 aExp = extractFloat128Exp( a );
4376 if ( 0x402F <= aExp ) {
4377 if ( 0x406F <= aExp ) {
4378 if ( ( aExp == 0x7FFF )
4379 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
4380 ) {
4381 return propagateFloat128NaN( a, a );
4382 }
4383 return a;
4384 }
4385 lastBitMask = 1;
4386 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
4387 roundBitsMask = lastBitMask - 1;
4388 z = a;
4389 roundingMode = float_rounding_mode;
4390 if ( roundingMode == float_round_nearest_even ) {
4391 if ( lastBitMask ) {
4392 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
4393 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
4394 }
4395 else {
4396 if ( (sbits64) z.low < 0 ) {
4397 ++z.high;
4398 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
4399 }
4400 }
4401 }
4402 else if ( roundingMode != float_round_to_zero ) {
4403 if ( extractFloat128Sign( z )
4404 ^ ( roundingMode == float_round_up ) ) {
4405 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
4406 }
4407 }
4408 z.low &= ~ roundBitsMask;
4409 }
4410 else {
4411 if ( aExp < 0x3FFF ) {
4412 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
4413 float_exception_flags |= float_flag_inexact;
4414 aSign = extractFloat128Sign( a );
4415 switch ( float_rounding_mode ) {
4416 case float_round_nearest_even:
4417 if ( ( aExp == 0x3FFE )
4418 && ( extractFloat128Frac0( a )
4419 | extractFloat128Frac1( a ) )
4420 ) {
4421 return packFloat128( aSign, 0x3FFF, 0, 0 );
4422 }
4423 break;
4424 case float_round_down:
4425 return
4426 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
4427 : packFloat128( 0, 0, 0, 0 );
4428 case float_round_up:
4429 return
4430 aSign ? packFloat128( 1, 0, 0, 0 )
4431 : packFloat128( 0, 0x3FFF, 0, 0 );
4432 }
4433 return packFloat128( aSign, 0, 0, 0 );
4434 }
4435 lastBitMask = 1;
4436 lastBitMask <<= 0x402F - aExp;
4437 roundBitsMask = lastBitMask - 1;
4438 z.low = 0;
4439 z.high = a.high;
4440 roundingMode = float_rounding_mode;
4441 if ( roundingMode == float_round_nearest_even ) {
4442 z.high += lastBitMask>>1;
4443 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
4444 z.high &= ~ lastBitMask;
4445 }
4446 }
4447 else if ( roundingMode != float_round_to_zero ) {
4448 if ( extractFloat128Sign( z )
4449 ^ ( roundingMode == float_round_up ) ) {
4450 z.high |= ( a.low != 0 );
4451 z.high += roundBitsMask;
4452 }
4453 }
4454 z.high &= ~ roundBitsMask;
4455 }
4456 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
4457 float_exception_flags |= float_flag_inexact;
4458 }
4459 return z;
4460
4461 }
4462
4463 /*----------------------------------------------------------------------------
4464 | Returns the result of adding the absolute values of the quadruple-precision
4465 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
4466 | before being returned. `zSign' is ignored if the result is a NaN.
4467 | The addition is performed according to the IEC/IEEE Standard for Binary
4468 | Floating-Point Arithmetic.
4469 *----------------------------------------------------------------------------*/
4470
4471 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign )
4472 {
4473 int32 aExp, bExp, zExp;
4474 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4475 int32 expDiff;
4476
4477 aSig1 = extractFloat128Frac1( a );
4478 aSig0 = extractFloat128Frac0( a );
4479 aExp = extractFloat128Exp( a );
4480 bSig1 = extractFloat128Frac1( b );
4481 bSig0 = extractFloat128Frac0( b );
4482 bExp = extractFloat128Exp( b );
4483 expDiff = aExp - bExp;
4484 if ( 0 < expDiff ) {
4485 if ( aExp == 0x7FFF ) {
4486 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4487 return a;
4488 }
4489 if ( bExp == 0 ) {
4490 --expDiff;
4491 }
4492 else {
4493 bSig0 |= LIT64( 0x0001000000000000 );
4494 }
4495 shift128ExtraRightJamming(
4496 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
4497 zExp = aExp;
4498 }
4499 else if ( expDiff < 0 ) {
4500 if ( bExp == 0x7FFF ) {
4501 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4502 return packFloat128( zSign, 0x7FFF, 0, 0 );
4503 }
4504 if ( aExp == 0 ) {
4505 ++expDiff;
4506 }
4507 else {
4508 aSig0 |= LIT64( 0x0001000000000000 );
4509 }
4510 shift128ExtraRightJamming(
4511 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
4512 zExp = bExp;
4513 }
4514 else {
4515 if ( aExp == 0x7FFF ) {
4516 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4517 return propagateFloat128NaN( a, b );
4518 }
4519 return a;
4520 }
4521 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4522 if ( aExp == 0 ) return packFloat128( zSign, 0, zSig0, zSig1 );
4523 zSig2 = 0;
4524 zSig0 |= LIT64( 0x0002000000000000 );
4525 zExp = aExp;
4526 goto shiftRight1;
4527 }
4528 aSig0 |= LIT64( 0x0001000000000000 );
4529 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4530 --zExp;
4531 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
4532 ++zExp;
4533 shiftRight1:
4534 shift128ExtraRightJamming(
4535 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4536 roundAndPack:
4537 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4538
4539 }
4540
4541 /*----------------------------------------------------------------------------
4542 | Returns the result of subtracting the absolute values of the quadruple-
4543 | precision floating-point values `a' and `b'. If `zSign' is 1, the
4544 | difference is negated before being returned. `zSign' is ignored if the
4545 | result is a NaN. The subtraction is performed according to the IEC/IEEE
4546 | Standard for Binary Floating-Point Arithmetic.
4547 *----------------------------------------------------------------------------*/
4548
4549 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign )
4550 {
4551 int32 aExp, bExp, zExp;
4552 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
4553 int32 expDiff;
4554 float128 z;
4555
4556 aSig1 = extractFloat128Frac1( a );
4557 aSig0 = extractFloat128Frac0( a );
4558 aExp = extractFloat128Exp( a );
4559 bSig1 = extractFloat128Frac1( b );
4560 bSig0 = extractFloat128Frac0( b );
4561 bExp = extractFloat128Exp( b );
4562 expDiff = aExp - bExp;
4563 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
4564 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
4565 if ( 0 < expDiff ) goto aExpBigger;
4566 if ( expDiff < 0 ) goto bExpBigger;
4567 if ( aExp == 0x7FFF ) {
4568 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
4569 return propagateFloat128NaN( a, b );
4570 }
4571 float_raise( float_flag_invalid );
4572 z.low = float128_default_nan_low;
4573 z.high = float128_default_nan_high;
4574 return z;
4575 }
4576 if ( aExp == 0 ) {
4577 aExp = 1;
4578 bExp = 1;
4579 }
4580 if ( bSig0 < aSig0 ) goto aBigger;
4581 if ( aSig0 < bSig0 ) goto bBigger;
4582 if ( bSig1 < aSig1 ) goto aBigger;
4583 if ( aSig1 < bSig1 ) goto bBigger;
4584 return packFloat128( float_rounding_mode == float_round_down, 0, 0, 0 );
4585 bExpBigger:
4586 if ( bExp == 0x7FFF ) {
4587 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4588 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
4589 }
4590 if ( aExp == 0 ) {
4591 ++expDiff;
4592 }
4593 else {
4594 aSig0 |= LIT64( 0x4000000000000000 );
4595 }
4596 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4597 bSig0 |= LIT64( 0x4000000000000000 );
4598 bBigger:
4599 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
4600 zExp = bExp;
4601 zSign ^= 1;
4602 goto normalizeRoundAndPack;
4603 aExpBigger:
4604 if ( aExp == 0x7FFF ) {
4605 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4606 return a;
4607 }
4608 if ( bExp == 0 ) {
4609 --expDiff;
4610 }
4611 else {
4612 bSig0 |= LIT64( 0x4000000000000000 );
4613 }
4614 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
4615 aSig0 |= LIT64( 0x4000000000000000 );
4616 aBigger:
4617 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
4618 zExp = aExp;
4619 normalizeRoundAndPack:
4620 --zExp;
4621 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 );
4622
4623 }
4624
4625 /*----------------------------------------------------------------------------
4626 | Returns the result of adding the quadruple-precision floating-point values
4627 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
4628 | for Binary Floating-Point Arithmetic.
4629 *----------------------------------------------------------------------------*/
4630
4631 float128 float128_add( float128 a, float128 b )
4632 {
4633 flag aSign, bSign;
4634
4635 aSign = extractFloat128Sign( a );
4636 bSign = extractFloat128Sign( b );
4637 if ( aSign == bSign ) {
4638 return addFloat128Sigs( a, b, aSign );
4639 }
4640 else {
4641 return subFloat128Sigs( a, b, aSign );
4642 }
4643
4644 }
4645
4646 /*----------------------------------------------------------------------------
4647 | Returns the result of subtracting the quadruple-precision floating-point
4648 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4649 | Standard for Binary Floating-Point Arithmetic.
4650 *----------------------------------------------------------------------------*/
4651
4652 float128 float128_sub( float128 a, float128 b )
4653 {
4654 flag aSign, bSign;
4655
4656 aSign = extractFloat128Sign( a );
4657 bSign = extractFloat128Sign( b );
4658 if ( aSign == bSign ) {
4659 return subFloat128Sigs( a, b, aSign );
4660 }
4661 else {
4662 return addFloat128Sigs( a, b, aSign );
4663 }
4664
4665 }
4666
4667 /*----------------------------------------------------------------------------
4668 | Returns the result of multiplying the quadruple-precision floating-point
4669 | values `a' and `b'. The operation is performed according to the IEC/IEEE
4670 | Standard for Binary Floating-Point Arithmetic.
4671 *----------------------------------------------------------------------------*/
4672
4673 float128 float128_mul( float128 a, float128 b )
4674 {
4675 flag aSign, bSign, zSign;
4676 int32 aExp, bExp, zExp;
4677 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
4678 float128 z;
4679
4680 aSig1 = extractFloat128Frac1( a );
4681 aSig0 = extractFloat128Frac0( a );
4682 aExp = extractFloat128Exp( a );
4683 aSign = extractFloat128Sign( a );
4684 bSig1 = extractFloat128Frac1( b );
4685 bSig0 = extractFloat128Frac0( b );
4686 bExp = extractFloat128Exp( b );
4687 bSign = extractFloat128Sign( b );
4688 zSign = aSign ^ bSign;
4689 if ( aExp == 0x7FFF ) {
4690 if ( ( aSig0 | aSig1 )
4691 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4692 return propagateFloat128NaN( a, b );
4693 }
4694 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
4695 return packFloat128( zSign, 0x7FFF, 0, 0 );
4696 }
4697 if ( bExp == 0x7FFF ) {
4698 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4699 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4700 invalid:
4701 float_raise( float_flag_invalid );
4702 z.low = float128_default_nan_low;
4703 z.high = float128_default_nan_high;
4704 return z;
4705 }
4706 return packFloat128( zSign, 0x7FFF, 0, 0 );
4707 }
4708 if ( aExp == 0 ) {
4709 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4710 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4711 }
4712 if ( bExp == 0 ) {
4713 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4714 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4715 }
4716 zExp = aExp + bExp - 0x4000;
4717 aSig0 |= LIT64( 0x0001000000000000 );
4718 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
4719 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
4720 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
4721 zSig2 |= ( zSig3 != 0 );
4722 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
4723 shift128ExtraRightJamming(
4724 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
4725 ++zExp;
4726 }
4727 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4728
4729 }
4730
4731 /*----------------------------------------------------------------------------
4732 | Returns the result of dividing the quadruple-precision floating-point value
4733 | `a' by the corresponding value `b'. The operation is performed according to
4734 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4735 *----------------------------------------------------------------------------*/
4736
4737 float128 float128_div( float128 a, float128 b )
4738 {
4739 flag aSign, bSign, zSign;
4740 int32 aExp, bExp, zExp;
4741 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
4742 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4743 float128 z;
4744
4745 aSig1 = extractFloat128Frac1( a );
4746 aSig0 = extractFloat128Frac0( a );
4747 aExp = extractFloat128Exp( a );
4748 aSign = extractFloat128Sign( a );
4749 bSig1 = extractFloat128Frac1( b );
4750 bSig0 = extractFloat128Frac0( b );
4751 bExp = extractFloat128Exp( b );
4752 bSign = extractFloat128Sign( b );
4753 zSign = aSign ^ bSign;
4754 if ( aExp == 0x7FFF ) {
4755 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b );
4756 if ( bExp == 0x7FFF ) {
4757 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4758 goto invalid;
4759 }
4760 return packFloat128( zSign, 0x7FFF, 0, 0 );
4761 }
4762 if ( bExp == 0x7FFF ) {
4763 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4764 return packFloat128( zSign, 0, 0, 0 );
4765 }
4766 if ( bExp == 0 ) {
4767 if ( ( bSig0 | bSig1 ) == 0 ) {
4768 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
4769 invalid:
4770 float_raise( float_flag_invalid );
4771 z.low = float128_default_nan_low;
4772 z.high = float128_default_nan_high;
4773 return z;
4774 }
4775 float_raise( float_flag_divbyzero );
4776 return packFloat128( zSign, 0x7FFF, 0, 0 );
4777 }
4778 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4779 }
4780 if ( aExp == 0 ) {
4781 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
4782 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4783 }
4784 zExp = aExp - bExp + 0x3FFD;
4785 shortShift128Left(
4786 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
4787 shortShift128Left(
4788 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4789 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
4790 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
4791 ++zExp;
4792 }
4793 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
4794 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
4795 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
4796 while ( (sbits64) rem0 < 0 ) {
4797 --zSig0;
4798 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
4799 }
4800 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
4801 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
4802 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
4803 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
4804 while ( (sbits64) rem1 < 0 ) {
4805 --zSig1;
4806 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
4807 }
4808 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4809 }
4810 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
4811 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 );
4812
4813 }
4814
4815 /*----------------------------------------------------------------------------
4816 | Returns the remainder of the quadruple-precision floating-point value `a'
4817 | with respect to the corresponding value `b'. The operation is performed
4818 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4819 *----------------------------------------------------------------------------*/
4820
4821 float128 float128_rem( float128 a, float128 b )
4822 {
4823 flag aSign, bSign, zSign;
4824 int32 aExp, bExp, expDiff;
4825 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
4826 bits64 allZero, alternateASig0, alternateASig1, sigMean1;
4827 sbits64 sigMean0;
4828 float128 z;
4829
4830 aSig1 = extractFloat128Frac1( a );
4831 aSig0 = extractFloat128Frac0( a );
4832 aExp = extractFloat128Exp( a );
4833 aSign = extractFloat128Sign( a );
4834 bSig1 = extractFloat128Frac1( b );
4835 bSig0 = extractFloat128Frac0( b );
4836 bExp = extractFloat128Exp( b );
4837 bSign = extractFloat128Sign( b );
4838 if ( aExp == 0x7FFF ) {
4839 if ( ( aSig0 | aSig1 )
4840 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
4841 return propagateFloat128NaN( a, b );
4842 }
4843 goto invalid;
4844 }
4845 if ( bExp == 0x7FFF ) {
4846 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b );
4847 return a;
4848 }
4849 if ( bExp == 0 ) {
4850 if ( ( bSig0 | bSig1 ) == 0 ) {
4851 invalid:
4852 float_raise( float_flag_invalid );
4853 z.low = float128_default_nan_low;
4854 z.high = float128_default_nan_high;
4855 return z;
4856 }
4857 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
4858 }
4859 if ( aExp == 0 ) {
4860 if ( ( aSig0 | aSig1 ) == 0 ) return a;
4861 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4862 }
4863 expDiff = aExp - bExp;
4864 if ( expDiff < -1 ) return a;
4865 shortShift128Left(
4866 aSig0 | LIT64( 0x0001000000000000 ),
4867 aSig1,
4868 15 - ( expDiff < 0 ),
4869 &aSig0,
4870 &aSig1
4871 );
4872 shortShift128Left(
4873 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
4874 q = le128( bSig0, bSig1, aSig0, aSig1 );
4875 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4876 expDiff -= 64;
4877 while ( 0 < expDiff ) {
4878 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4879 q = ( 4 < q ) ? q - 4 : 0;
4880 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4881 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
4882 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
4883 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
4884 expDiff -= 61;
4885 }
4886 if ( -64 < expDiff ) {
4887 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
4888 q = ( 4 < q ) ? q - 4 : 0;
4889 q >>= - expDiff;
4890 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4891 expDiff += 52;
4892 if ( expDiff < 0 ) {
4893 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
4894 }
4895 else {
4896 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
4897 }
4898 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
4899 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
4900 }
4901 else {
4902 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
4903 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
4904 }
4905 do {
4906 alternateASig0 = aSig0;
4907 alternateASig1 = aSig1;
4908 ++q;
4909 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
4910 } while ( 0 <= (sbits64) aSig0 );
4911 add128(
4912 aSig0, aSig1, alternateASig0, alternateASig1, (bits64*)&sigMean0, &sigMean1 );
4913 if ( ( sigMean0 < 0 )
4914 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
4915 aSig0 = alternateASig0;
4916 aSig1 = alternateASig1;
4917 }
4918 zSign = ( (sbits64) aSig0 < 0 );
4919 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
4920 return
4921 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 );
4922
4923 }
4924
4925 /*----------------------------------------------------------------------------
4926 | Returns the square root of the quadruple-precision floating-point value `a'.
4927 | The operation is performed according to the IEC/IEEE Standard for Binary
4928 | Floating-Point Arithmetic.
4929 *----------------------------------------------------------------------------*/
4930
4931 float128 float128_sqrt( float128 a )
4932 {
4933 flag aSign;
4934 int32 aExp, zExp;
4935 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
4936 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
4937 float128 z;
4938
4939 aSig1 = extractFloat128Frac1( a );
4940 aSig0 = extractFloat128Frac0( a );
4941 aExp = extractFloat128Exp( a );
4942 aSign = extractFloat128Sign( a );
4943 if ( aExp == 0x7FFF ) {
4944 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a );
4945 if ( ! aSign ) return a;
4946 goto invalid;
4947 }
4948 if ( aSign ) {
4949 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
4950 invalid:
4951 float_raise( float_flag_invalid );
4952 z.low = float128_default_nan_low;
4953 z.high = float128_default_nan_high;
4954 return z;
4955 }
4956 if ( aExp == 0 ) {
4957 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
4958 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
4959 }
4960 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
4961 aSig0 |= LIT64( 0x0001000000000000 );
4962 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
4963 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
4964 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
4965 doubleZSig0 = zSig0<<1;
4966 mul64To128( zSig0, zSig0, &term0, &term1 );
4967 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
4968 while ( (sbits64) rem0 < 0 ) {
4969 --zSig0;
4970 doubleZSig0 -= 2;
4971 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
4972 }
4973 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
4974 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
4975 if ( zSig1 == 0 ) zSig1 = 1;
4976 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
4977 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
4978 mul64To128( zSig1, zSig1, &term2, &term3 );
4979 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
4980 while ( (sbits64) rem1 < 0 ) {
4981 --zSig1;
4982 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
4983 term3 |= 1;
4984 term2 |= doubleZSig0;
4985 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
4986 }
4987 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
4988 }
4989 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
4990 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 );
4991
4992 }
4993
4994 /*----------------------------------------------------------------------------
4995 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
4996 | the corresponding value `b', and 0 otherwise. The comparison is performed
4997 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4998 *----------------------------------------------------------------------------*/
4999
5000 flag float128_eq( float128 a, float128 b )
5001 {
5002
5003 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5004 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5005 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5006 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5007 ) {
5008 if ( float128_is_signaling_nan( a )
5009 || float128_is_signaling_nan( b ) ) {
5010 float_raise( float_flag_invalid );
5011 }
5012 return 0;
5013 }
5014 return
5015 ( a.low == b.low )
5016 && ( ( a.high == b.high )
5017 || ( ( a.low == 0 )
5018 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5019 );
5020
5021 }
5022
5023 /*----------------------------------------------------------------------------
5024 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5025 | or equal to the corresponding value `b', and 0 otherwise. The comparison
5026 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5027 | Arithmetic.
5028 *----------------------------------------------------------------------------*/
5029
5030 flag float128_le( float128 a, float128 b )
5031 {
5032 flag aSign, bSign;
5033
5034 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5035 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5036 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5037 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5038 ) {
5039 float_raise( float_flag_invalid );
5040 return 0;
5041 }
5042 aSign = extractFloat128Sign( a );
5043 bSign = extractFloat128Sign( b );
5044 if ( aSign != bSign ) {
5045 return
5046 aSign
5047 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5048 == 0 );
5049 }
5050 return
5051 aSign ? le128( b.high, b.low, a.high, a.low )
5052 : le128( a.high, a.low, b.high, b.low );
5053
5054 }
5055
5056 /*----------------------------------------------------------------------------
5057 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5058 | the corresponding value `b', and 0 otherwise. The comparison is performed
5059 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5060 *----------------------------------------------------------------------------*/
5061
5062 flag float128_lt( float128 a, float128 b )
5063 {
5064 flag aSign, bSign;
5065
5066 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5067 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5068 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5069 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5070 ) {
5071 float_raise( float_flag_invalid );
5072 return 0;
5073 }
5074 aSign = extractFloat128Sign( a );
5075 bSign = extractFloat128Sign( b );
5076 if ( aSign != bSign ) {
5077 return
5078 aSign
5079 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5080 != 0 );
5081 }
5082 return
5083 aSign ? lt128( b.high, b.low, a.high, a.low )
5084 : lt128( a.high, a.low, b.high, b.low );
5085
5086 }
5087
5088 /*----------------------------------------------------------------------------
5089 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
5090 | the corresponding value `b', and 0 otherwise. The invalid exception is
5091 | raised if either operand is a NaN. Otherwise, the comparison is performed
5092 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5093 *----------------------------------------------------------------------------*/
5094
5095 flag float128_eq_signaling( float128 a, float128 b )
5096 {
5097
5098 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5099 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5100 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5101 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5102 ) {
5103 float_raise( float_flag_invalid );
5104 return 0;
5105 }
5106 return
5107 ( a.low == b.low )
5108 && ( ( a.high == b.high )
5109 || ( ( a.low == 0 )
5110 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
5111 );
5112
5113 }
5114
5115 /*----------------------------------------------------------------------------
5116 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5117 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5118 | cause an exception. Otherwise, the comparison is performed according to the
5119 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5120 *----------------------------------------------------------------------------*/
5121
5122 flag float128_le_quiet( float128 a, float128 b )
5123 {
5124 flag aSign, bSign;
5125
5126 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5127 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5128 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5129 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5130 ) {
5131 if ( float128_is_signaling_nan( a )
5132 || float128_is_signaling_nan( b ) ) {
5133 float_raise( float_flag_invalid );
5134 }
5135 return 0;
5136 }
5137 aSign = extractFloat128Sign( a );
5138 bSign = extractFloat128Sign( b );
5139 if ( aSign != bSign ) {
5140 return
5141 aSign
5142 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5143 == 0 );
5144 }
5145 return
5146 aSign ? le128( b.high, b.low, a.high, a.low )
5147 : le128( a.high, a.low, b.high, b.low );
5148
5149 }
5150
5151 /*----------------------------------------------------------------------------
5152 | Returns 1 if the quadruple-precision floating-point value `a' is less than
5153 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
5154 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
5155 | Standard for Binary Floating-Point Arithmetic.
5156 *----------------------------------------------------------------------------*/
5157
5158 flag float128_lt_quiet( float128 a, float128 b )
5159 {
5160 flag aSign, bSign;
5161
5162 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
5163 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
5164 || ( ( extractFloat128Exp( b ) == 0x7FFF )
5165 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
5166 ) {
5167 if ( float128_is_signaling_nan( a )
5168 || float128_is_signaling_nan( b ) ) {
5169 float_raise( float_flag_invalid );
5170 }
5171 return 0;
5172 }
5173 aSign = extractFloat128Sign( a );
5174 bSign = extractFloat128Sign( b );
5175 if ( aSign != bSign ) {
5176 return
5177 aSign
5178 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5179 != 0 );
5180 }
5181 return
5182 aSign ? lt128( b.high, b.low, a.high, a.low )
5183 : lt128( a.high, a.low, b.high, b.low );
5184
5185 }
5186
5187 #endif
5188