4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
19 ===============================================================================
20 This C source file is part of the SoftFloat IEC/IEEE Floating-point
21 Arithmetic Package, Release 2a.
23 Written by John R. Hauser. This work was made possible in part by the
24 International Computer Science Institute, located at Suite 600, 1947 Center
25 Street, Berkeley, California 94704. Funding was partially provided by the
26 National Science Foundation under grant MIP-9311980. The original version
27 of this code was written as part of a project to build a fixed-point vector
28 processor in collaboration with the University of California at Berkeley,
29 overseen by Profs. Nelson Morgan and John Wawrzynek. More information
30 is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
31 arithmetic/SoftFloat.html'.
33 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34 has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35 TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36 PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37 AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
39 Derivative works are acceptable, even for commercial purposes, so long as
40 (1) they include prominent notice that the work is derivative, and (2) they
41 include prominent notice akin to these four paragraphs for those parts of
42 this code that are retained.
44 ===============================================================================
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
78 /* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
82 /* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
87 #include "fpu/softfloat.h"
89 /* We only need stdlib for abort() */
92 /*----------------------------------------------------------------------------
93 | Primitive arithmetic functions, including multi-word arithmetic, and
94 | division and square root approximations. (Can be specialized to target if
96 *----------------------------------------------------------------------------*/
97 #include "softfloat-macros.h"
99 /*----------------------------------------------------------------------------
100 | Functions and definitions to determine: (1) whether tininess for underflow
101 | is detected before or after rounding by default, (2) what (if anything)
102 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
103 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104 | are propagated from function inputs to output. These details are target-
106 *----------------------------------------------------------------------------*/
107 #include "softfloat-specialize.h"
109 /*----------------------------------------------------------------------------
110 | Returns the fraction bits of the half-precision floating-point value `a'.
111 *----------------------------------------------------------------------------*/
113 static inline uint32_t extractFloat16Frac(float16 a)
115 return float16_val(a) & 0x3ff;
118 /*----------------------------------------------------------------------------
119 | Returns the exponent bits of the half-precision floating-point value `a'.
120 *----------------------------------------------------------------------------*/
122 static inline int_fast16_t extractFloat16Exp(float16 a)
124 return (float16_val(a) >> 10) & 0x1f;
127 /*----------------------------------------------------------------------------
128 | Returns the sign bit of the single-precision floating-point value `a'.
129 *----------------------------------------------------------------------------*/
131 static inline flag extractFloat16Sign(float16 a)
133 return float16_val(a)>>15;
136 /*----------------------------------------------------------------------------
137 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138 | and 7, and returns the properly rounded 32-bit integer corresponding to the
139 | input. If `zSign' is 1, the input is negated before being converted to an
140 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141 | is simply rounded to an integer, with the inexact exception raised if the
142 | input cannot be represented exactly as an integer. However, if the fixed-
143 | point input is too large, the invalid exception is raised and the largest
144 | positive or negative integer is returned.
145 *----------------------------------------------------------------------------*/
147 static int32 roundAndPackInt32(flag zSign, uint64_t absZ, float_status *status)
150 flag roundNearestEven;
151 int8 roundIncrement, roundBits;
154 roundingMode = status->float_rounding_mode;
155 roundNearestEven = ( roundingMode == float_round_nearest_even );
156 switch (roundingMode) {
157 case float_round_nearest_even:
158 case float_round_ties_away:
159 roundIncrement = 0x40;
161 case float_round_to_zero:
165 roundIncrement = zSign ? 0 : 0x7f;
167 case float_round_down:
168 roundIncrement = zSign ? 0x7f : 0;
173 roundBits = absZ & 0x7F;
174 absZ = ( absZ + roundIncrement )>>7;
175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
177 if ( zSign ) z = - z;
178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
179 float_raise(float_flag_invalid, status);
180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
183 status->float_exception_flags |= float_flag_inexact;
189 /*----------------------------------------------------------------------------
190 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
191 | `absZ1', with binary point between bits 63 and 64 (between the input words),
192 | and returns the properly rounded 64-bit integer corresponding to the input.
193 | If `zSign' is 1, the input is negated before being converted to an integer.
194 | Ordinarily, the fixed-point input is simply rounded to an integer, with
195 | the inexact exception raised if the input cannot be represented exactly as
196 | an integer. However, if the fixed-point input is too large, the invalid
197 | exception is raised and the largest positive or negative integer is
199 *----------------------------------------------------------------------------*/
201 static int64 roundAndPackInt64(flag zSign, uint64_t absZ0, uint64_t absZ1,
202 float_status *status)
205 flag roundNearestEven, increment;
208 roundingMode = status->float_rounding_mode;
209 roundNearestEven = ( roundingMode == float_round_nearest_even );
210 switch (roundingMode) {
211 case float_round_nearest_even:
212 case float_round_ties_away:
213 increment = ((int64_t) absZ1 < 0);
215 case float_round_to_zero:
219 increment = !zSign && absZ1;
221 case float_round_down:
222 increment = zSign && absZ1;
229 if ( absZ0 == 0 ) goto overflow;
230 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
233 if ( zSign ) z = - z;
234 if ( z && ( ( z < 0 ) ^ zSign ) ) {
236 float_raise(float_flag_invalid, status);
238 zSign ? (int64_t) LIT64( 0x8000000000000000 )
239 : LIT64( 0x7FFFFFFFFFFFFFFF );
242 status->float_exception_flags |= float_flag_inexact;
248 /*----------------------------------------------------------------------------
249 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
250 | `absZ1', with binary point between bits 63 and 64 (between the input words),
251 | and returns the properly rounded 64-bit unsigned integer corresponding to the
252 | input. Ordinarily, the fixed-point input is simply rounded to an integer,
253 | with the inexact exception raised if the input cannot be represented exactly
254 | as an integer. However, if the fixed-point input is too large, the invalid
255 | exception is raised and the largest unsigned integer is returned.
256 *----------------------------------------------------------------------------*/
258 static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
259 uint64_t absZ1, float_status *status)
262 flag roundNearestEven, increment;
264 roundingMode = status->float_rounding_mode;
265 roundNearestEven = (roundingMode == float_round_nearest_even);
266 switch (roundingMode) {
267 case float_round_nearest_even:
268 case float_round_ties_away:
269 increment = ((int64_t)absZ1 < 0);
271 case float_round_to_zero:
275 increment = !zSign && absZ1;
277 case float_round_down:
278 increment = zSign && absZ1;
286 float_raise(float_flag_invalid, status);
287 return LIT64(0xFFFFFFFFFFFFFFFF);
289 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
292 if (zSign && absZ0) {
293 float_raise(float_flag_invalid, status);
298 status->float_exception_flags |= float_flag_inexact;
303 /*----------------------------------------------------------------------------
304 | Returns the fraction bits of the single-precision floating-point value `a'.
305 *----------------------------------------------------------------------------*/
307 static inline uint32_t extractFloat32Frac( float32 a )
310 return float32_val(a) & 0x007FFFFF;
314 /*----------------------------------------------------------------------------
315 | Returns the exponent bits of the single-precision floating-point value `a'.
316 *----------------------------------------------------------------------------*/
318 static inline int_fast16_t extractFloat32Exp(float32 a)
321 return ( float32_val(a)>>23 ) & 0xFF;
325 /*----------------------------------------------------------------------------
326 | Returns the sign bit of the single-precision floating-point value `a'.
327 *----------------------------------------------------------------------------*/
329 static inline flag extractFloat32Sign( float32 a )
332 return float32_val(a)>>31;
336 /*----------------------------------------------------------------------------
337 | If `a' is denormal and we are in flush-to-zero mode then set the
338 | input-denormal exception and return zero. Otherwise just return the value.
339 *----------------------------------------------------------------------------*/
340 float32 float32_squash_input_denormal(float32 a, float_status *status)
342 if (status->flush_inputs_to_zero) {
343 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
344 float_raise(float_flag_input_denormal, status);
345 return make_float32(float32_val(a) & 0x80000000);
351 /*----------------------------------------------------------------------------
352 | Normalizes the subnormal single-precision floating-point value represented
353 | by the denormalized significand `aSig'. The normalized exponent and
354 | significand are stored at the locations pointed to by `zExpPtr' and
355 | `zSigPtr', respectively.
356 *----------------------------------------------------------------------------*/
359 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
363 shiftCount = countLeadingZeros32( aSig ) - 8;
364 *zSigPtr = aSig<<shiftCount;
365 *zExpPtr = 1 - shiftCount;
369 /*----------------------------------------------------------------------------
370 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
371 | single-precision floating-point value, returning the result. After being
372 | shifted into the proper positions, the three fields are simply added
373 | together to form the result. This means that any integer portion of `zSig'
374 | will be added into the exponent. Since a properly normalized significand
375 | will have an integer portion equal to 1, the `zExp' input should be 1 less
376 | than the desired result exponent whenever `zSig' is a complete, normalized
378 *----------------------------------------------------------------------------*/
380 static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
384 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
388 /*----------------------------------------------------------------------------
389 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
390 | and significand `zSig', and returns the proper single-precision floating-
391 | point value corresponding to the abstract input. Ordinarily, the abstract
392 | value is simply rounded and packed into the single-precision format, with
393 | the inexact exception raised if the abstract input cannot be represented
394 | exactly. However, if the abstract value is too large, the overflow and
395 | inexact exceptions are raised and an infinity or maximal finite value is
396 | returned. If the abstract value is too small, the input value is rounded to
397 | a subnormal number, and the underflow and inexact exceptions are raised if
398 | the abstract input cannot be represented exactly as a subnormal single-
399 | precision floating-point number.
400 | The input significand `zSig' has its binary point between bits 30
401 | and 29, which is 7 bits to the left of the usual location. This shifted
402 | significand must be normalized or smaller. If `zSig' is not normalized,
403 | `zExp' must be 0; in that case, the result returned is a subnormal number,
404 | and it must not require rounding. In the usual case that `zSig' is
405 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
406 | The handling of underflow and overflow follows the IEC/IEEE Standard for
407 | Binary Floating-Point Arithmetic.
408 *----------------------------------------------------------------------------*/
410 static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
411 float_status *status)
414 flag roundNearestEven;
415 int8 roundIncrement, roundBits;
418 roundingMode = status->float_rounding_mode;
419 roundNearestEven = ( roundingMode == float_round_nearest_even );
420 switch (roundingMode) {
421 case float_round_nearest_even:
422 case float_round_ties_away:
423 roundIncrement = 0x40;
425 case float_round_to_zero:
429 roundIncrement = zSign ? 0 : 0x7f;
431 case float_round_down:
432 roundIncrement = zSign ? 0x7f : 0;
438 roundBits = zSig & 0x7F;
439 if ( 0xFD <= (uint16_t) zExp ) {
441 || ( ( zExp == 0xFD )
442 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
444 float_raise(float_flag_overflow | float_flag_inexact, status);
445 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
448 if (status->flush_to_zero) {
449 float_raise(float_flag_output_denormal, status);
450 return packFloat32(zSign, 0, 0);
453 (status->float_detect_tininess
454 == float_tininess_before_rounding)
456 || ( zSig + roundIncrement < 0x80000000 );
457 shift32RightJamming( zSig, - zExp, &zSig );
459 roundBits = zSig & 0x7F;
460 if (isTiny && roundBits) {
461 float_raise(float_flag_underflow, status);
466 status->float_exception_flags |= float_flag_inexact;
468 zSig = ( zSig + roundIncrement )>>7;
469 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
470 if ( zSig == 0 ) zExp = 0;
471 return packFloat32( zSign, zExp, zSig );
475 /*----------------------------------------------------------------------------
476 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
477 | and significand `zSig', and returns the proper single-precision floating-
478 | point value corresponding to the abstract input. This routine is just like
479 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
480 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
481 | floating-point exponent.
482 *----------------------------------------------------------------------------*/
485 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig,
486 float_status *status)
490 shiftCount = countLeadingZeros32( zSig ) - 1;
491 return roundAndPackFloat32(zSign, zExp - shiftCount, zSig<<shiftCount,
496 /*----------------------------------------------------------------------------
497 | Returns the fraction bits of the double-precision floating-point value `a'.
498 *----------------------------------------------------------------------------*/
500 static inline uint64_t extractFloat64Frac( float64 a )
503 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
507 /*----------------------------------------------------------------------------
508 | Returns the exponent bits of the double-precision floating-point value `a'.
509 *----------------------------------------------------------------------------*/
511 static inline int_fast16_t extractFloat64Exp(float64 a)
514 return ( float64_val(a)>>52 ) & 0x7FF;
518 /*----------------------------------------------------------------------------
519 | Returns the sign bit of the double-precision floating-point value `a'.
520 *----------------------------------------------------------------------------*/
522 static inline flag extractFloat64Sign( float64 a )
525 return float64_val(a)>>63;
529 /*----------------------------------------------------------------------------
530 | If `a' is denormal and we are in flush-to-zero mode then set the
531 | input-denormal exception and return zero. Otherwise just return the value.
532 *----------------------------------------------------------------------------*/
533 float64 float64_squash_input_denormal(float64 a, float_status *status)
535 if (status->flush_inputs_to_zero) {
536 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
537 float_raise(float_flag_input_denormal, status);
538 return make_float64(float64_val(a) & (1ULL << 63));
544 /*----------------------------------------------------------------------------
545 | Normalizes the subnormal double-precision floating-point value represented
546 | by the denormalized significand `aSig'. The normalized exponent and
547 | significand are stored at the locations pointed to by `zExpPtr' and
548 | `zSigPtr', respectively.
549 *----------------------------------------------------------------------------*/
552 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
556 shiftCount = countLeadingZeros64( aSig ) - 11;
557 *zSigPtr = aSig<<shiftCount;
558 *zExpPtr = 1 - shiftCount;
562 /*----------------------------------------------------------------------------
563 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
564 | double-precision floating-point value, returning the result. After being
565 | shifted into the proper positions, the three fields are simply added
566 | together to form the result. This means that any integer portion of `zSig'
567 | will be added into the exponent. Since a properly normalized significand
568 | will have an integer portion equal to 1, the `zExp' input should be 1 less
569 | than the desired result exponent whenever `zSig' is a complete, normalized
571 *----------------------------------------------------------------------------*/
573 static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
577 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
581 /*----------------------------------------------------------------------------
582 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
583 | and significand `zSig', and returns the proper double-precision floating-
584 | point value corresponding to the abstract input. Ordinarily, the abstract
585 | value is simply rounded and packed into the double-precision format, with
586 | the inexact exception raised if the abstract input cannot be represented
587 | exactly. However, if the abstract value is too large, the overflow and
588 | inexact exceptions are raised and an infinity or maximal finite value is
589 | returned. If the abstract value is too small, the input value is rounded to
590 | a subnormal number, and the underflow and inexact exceptions are raised if
591 | the abstract input cannot be represented exactly as a subnormal double-
592 | precision floating-point number.
593 | The input significand `zSig' has its binary point between bits 62
594 | and 61, which is 10 bits to the left of the usual location. This shifted
595 | significand must be normalized or smaller. If `zSig' is not normalized,
596 | `zExp' must be 0; in that case, the result returned is a subnormal number,
597 | and it must not require rounding. In the usual case that `zSig' is
598 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
599 | The handling of underflow and overflow follows the IEC/IEEE Standard for
600 | Binary Floating-Point Arithmetic.
601 *----------------------------------------------------------------------------*/
603 static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
604 float_status *status)
607 flag roundNearestEven;
608 int_fast16_t roundIncrement, roundBits;
611 roundingMode = status->float_rounding_mode;
612 roundNearestEven = ( roundingMode == float_round_nearest_even );
613 switch (roundingMode) {
614 case float_round_nearest_even:
615 case float_round_ties_away:
616 roundIncrement = 0x200;
618 case float_round_to_zero:
622 roundIncrement = zSign ? 0 : 0x3ff;
624 case float_round_down:
625 roundIncrement = zSign ? 0x3ff : 0;
630 roundBits = zSig & 0x3FF;
631 if ( 0x7FD <= (uint16_t) zExp ) {
632 if ( ( 0x7FD < zExp )
633 || ( ( zExp == 0x7FD )
634 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
636 float_raise(float_flag_overflow | float_flag_inexact, status);
637 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
640 if (status->flush_to_zero) {
641 float_raise(float_flag_output_denormal, status);
642 return packFloat64(zSign, 0, 0);
645 (status->float_detect_tininess
646 == float_tininess_before_rounding)
648 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
649 shift64RightJamming( zSig, - zExp, &zSig );
651 roundBits = zSig & 0x3FF;
652 if (isTiny && roundBits) {
653 float_raise(float_flag_underflow, status);
658 status->float_exception_flags |= float_flag_inexact;
660 zSig = ( zSig + roundIncrement )>>10;
661 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
662 if ( zSig == 0 ) zExp = 0;
663 return packFloat64( zSign, zExp, zSig );
667 /*----------------------------------------------------------------------------
668 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
669 | and significand `zSig', and returns the proper double-precision floating-
670 | point value corresponding to the abstract input. This routine is just like
671 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
672 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
673 | floating-point exponent.
674 *----------------------------------------------------------------------------*/
677 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig,
678 float_status *status)
682 shiftCount = countLeadingZeros64( zSig ) - 1;
683 return roundAndPackFloat64(zSign, zExp - shiftCount, zSig<<shiftCount,
688 /*----------------------------------------------------------------------------
689 | Returns the fraction bits of the extended double-precision floating-point
691 *----------------------------------------------------------------------------*/
693 static inline uint64_t extractFloatx80Frac( floatx80 a )
700 /*----------------------------------------------------------------------------
701 | Returns the exponent bits of the extended double-precision floating-point
703 *----------------------------------------------------------------------------*/
705 static inline int32 extractFloatx80Exp( floatx80 a )
708 return a.high & 0x7FFF;
712 /*----------------------------------------------------------------------------
713 | Returns the sign bit of the extended double-precision floating-point value
715 *----------------------------------------------------------------------------*/
717 static inline flag extractFloatx80Sign( floatx80 a )
724 /*----------------------------------------------------------------------------
725 | Normalizes the subnormal extended double-precision floating-point value
726 | represented by the denormalized significand `aSig'. The normalized exponent
727 | and significand are stored at the locations pointed to by `zExpPtr' and
728 | `zSigPtr', respectively.
729 *----------------------------------------------------------------------------*/
732 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
736 shiftCount = countLeadingZeros64( aSig );
737 *zSigPtr = aSig<<shiftCount;
738 *zExpPtr = 1 - shiftCount;
742 /*----------------------------------------------------------------------------
743 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
744 | extended double-precision floating-point value, returning the result.
745 *----------------------------------------------------------------------------*/
747 static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
752 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
757 /*----------------------------------------------------------------------------
758 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
759 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
760 | and returns the proper extended double-precision floating-point value
761 | corresponding to the abstract input. Ordinarily, the abstract value is
762 | rounded and packed into the extended double-precision format, with the
763 | inexact exception raised if the abstract input cannot be represented
764 | exactly. However, if the abstract value is too large, the overflow and
765 | inexact exceptions are raised and an infinity or maximal finite value is
766 | returned. If the abstract value is too small, the input value is rounded to
767 | a subnormal number, and the underflow and inexact exceptions are raised if
768 | the abstract input cannot be represented exactly as a subnormal extended
769 | double-precision floating-point number.
770 | If `roundingPrecision' is 32 or 64, the result is rounded to the same
771 | number of bits as single or double precision, respectively. Otherwise, the
772 | result is rounded to the full precision of the extended double-precision
774 | The input significand must be normalized or smaller. If the input
775 | significand is not normalized, `zExp' must be 0; in that case, the result
776 | returned is a subnormal number, and it must not require rounding. The
777 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
778 | Floating-Point Arithmetic.
779 *----------------------------------------------------------------------------*/
781 static floatx80 roundAndPackFloatx80(int8 roundingPrecision, flag zSign,
782 int32 zExp, uint64_t zSig0, uint64_t zSig1,
783 float_status *status)
786 flag roundNearestEven, increment, isTiny;
787 int64 roundIncrement, roundMask, roundBits;
789 roundingMode = status->float_rounding_mode;
790 roundNearestEven = ( roundingMode == float_round_nearest_even );
791 if ( roundingPrecision == 80 ) goto precision80;
792 if ( roundingPrecision == 64 ) {
793 roundIncrement = LIT64( 0x0000000000000400 );
794 roundMask = LIT64( 0x00000000000007FF );
796 else if ( roundingPrecision == 32 ) {
797 roundIncrement = LIT64( 0x0000008000000000 );
798 roundMask = LIT64( 0x000000FFFFFFFFFF );
803 zSig0 |= ( zSig1 != 0 );
804 switch (roundingMode) {
805 case float_round_nearest_even:
806 case float_round_ties_away:
808 case float_round_to_zero:
812 roundIncrement = zSign ? 0 : roundMask;
814 case float_round_down:
815 roundIncrement = zSign ? roundMask : 0;
820 roundBits = zSig0 & roundMask;
821 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
822 if ( ( 0x7FFE < zExp )
823 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
828 if (status->flush_to_zero) {
829 float_raise(float_flag_output_denormal, status);
830 return packFloatx80(zSign, 0, 0);
833 (status->float_detect_tininess
834 == float_tininess_before_rounding)
836 || ( zSig0 <= zSig0 + roundIncrement );
837 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
839 roundBits = zSig0 & roundMask;
840 if (isTiny && roundBits) {
841 float_raise(float_flag_underflow, status);
844 status->float_exception_flags |= float_flag_inexact;
846 zSig0 += roundIncrement;
847 if ( (int64_t) zSig0 < 0 ) zExp = 1;
848 roundIncrement = roundMask + 1;
849 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
850 roundMask |= roundIncrement;
852 zSig0 &= ~ roundMask;
853 return packFloatx80( zSign, zExp, zSig0 );
857 status->float_exception_flags |= float_flag_inexact;
859 zSig0 += roundIncrement;
860 if ( zSig0 < roundIncrement ) {
862 zSig0 = LIT64( 0x8000000000000000 );
864 roundIncrement = roundMask + 1;
865 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
866 roundMask |= roundIncrement;
868 zSig0 &= ~ roundMask;
869 if ( zSig0 == 0 ) zExp = 0;
870 return packFloatx80( zSign, zExp, zSig0 );
872 switch (roundingMode) {
873 case float_round_nearest_even:
874 case float_round_ties_away:
875 increment = ((int64_t)zSig1 < 0);
877 case float_round_to_zero:
881 increment = !zSign && zSig1;
883 case float_round_down:
884 increment = zSign && zSig1;
889 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
890 if ( ( 0x7FFE < zExp )
891 || ( ( zExp == 0x7FFE )
892 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
898 float_raise(float_flag_overflow | float_flag_inexact, status);
899 if ( ( roundingMode == float_round_to_zero )
900 || ( zSign && ( roundingMode == float_round_up ) )
901 || ( ! zSign && ( roundingMode == float_round_down ) )
903 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
905 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
909 (status->float_detect_tininess
910 == float_tininess_before_rounding)
913 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
914 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
916 if (isTiny && zSig1) {
917 float_raise(float_flag_underflow, status);
920 status->float_exception_flags |= float_flag_inexact;
922 switch (roundingMode) {
923 case float_round_nearest_even:
924 case float_round_ties_away:
925 increment = ((int64_t)zSig1 < 0);
927 case float_round_to_zero:
931 increment = !zSign && zSig1;
933 case float_round_down:
934 increment = zSign && zSig1;
942 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
943 if ( (int64_t) zSig0 < 0 ) zExp = 1;
945 return packFloatx80( zSign, zExp, zSig0 );
949 status->float_exception_flags |= float_flag_inexact;
955 zSig0 = LIT64( 0x8000000000000000 );
958 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
962 if ( zSig0 == 0 ) zExp = 0;
964 return packFloatx80( zSign, zExp, zSig0 );
968 /*----------------------------------------------------------------------------
969 | Takes an abstract floating-point value having sign `zSign', exponent
970 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
971 | and returns the proper extended double-precision floating-point value
972 | corresponding to the abstract input. This routine is just like
973 | `roundAndPackFloatx80' except that the input significand does not have to be
975 *----------------------------------------------------------------------------*/
977 static floatx80 normalizeRoundAndPackFloatx80(int8 roundingPrecision,
978 flag zSign, int32 zExp,
979 uint64_t zSig0, uint64_t zSig1,
980 float_status *status)
989 shiftCount = countLeadingZeros64( zSig0 );
990 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
992 return roundAndPackFloatx80(roundingPrecision, zSign, zExp,
993 zSig0, zSig1, status);
997 /*----------------------------------------------------------------------------
998 | Returns the least-significant 64 fraction bits of the quadruple-precision
999 | floating-point value `a'.
1000 *----------------------------------------------------------------------------*/
1002 static inline uint64_t extractFloat128Frac1( float128 a )
1009 /*----------------------------------------------------------------------------
1010 | Returns the most-significant 48 fraction bits of the quadruple-precision
1011 | floating-point value `a'.
1012 *----------------------------------------------------------------------------*/
1014 static inline uint64_t extractFloat128Frac0( float128 a )
1017 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
1021 /*----------------------------------------------------------------------------
1022 | Returns the exponent bits of the quadruple-precision floating-point value
1024 *----------------------------------------------------------------------------*/
1026 static inline int32 extractFloat128Exp( float128 a )
1029 return ( a.high>>48 ) & 0x7FFF;
1033 /*----------------------------------------------------------------------------
1034 | Returns the sign bit of the quadruple-precision floating-point value `a'.
1035 *----------------------------------------------------------------------------*/
1037 static inline flag extractFloat128Sign( float128 a )
1044 /*----------------------------------------------------------------------------
1045 | Normalizes the subnormal quadruple-precision floating-point value
1046 | represented by the denormalized significand formed by the concatenation of
1047 | `aSig0' and `aSig1'. The normalized exponent is stored at the location
1048 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1049 | significand are stored at the location pointed to by `zSig0Ptr', and the
1050 | least significant 64 bits of the normalized significand are stored at the
1051 | location pointed to by `zSig1Ptr'.
1052 *----------------------------------------------------------------------------*/
1055 normalizeFloat128Subnormal(
1066 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1067 if ( shiftCount < 0 ) {
1068 *zSig0Ptr = aSig1>>( - shiftCount );
1069 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1072 *zSig0Ptr = aSig1<<shiftCount;
1075 *zExpPtr = - shiftCount - 63;
1078 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1079 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1080 *zExpPtr = 1 - shiftCount;
1085 /*----------------------------------------------------------------------------
1086 | Packs the sign `zSign', the exponent `zExp', and the significand formed
1087 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1088 | floating-point value, returning the result. After being shifted into the
1089 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1090 | added together to form the most significant 32 bits of the result. This
1091 | means that any integer portion of `zSig0' will be added into the exponent.
1092 | Since a properly normalized significand will have an integer portion equal
1093 | to 1, the `zExp' input should be 1 less than the desired result exponent
1094 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1096 *----------------------------------------------------------------------------*/
1098 static inline float128
1099 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
1104 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
1109 /*----------------------------------------------------------------------------
1110 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1111 | and extended significand formed by the concatenation of `zSig0', `zSig1',
1112 | and `zSig2', and returns the proper quadruple-precision floating-point value
1113 | corresponding to the abstract input. Ordinarily, the abstract value is
1114 | simply rounded and packed into the quadruple-precision format, with the
1115 | inexact exception raised if the abstract input cannot be represented
1116 | exactly. However, if the abstract value is too large, the overflow and
1117 | inexact exceptions are raised and an infinity or maximal finite value is
1118 | returned. If the abstract value is too small, the input value is rounded to
1119 | a subnormal number, and the underflow and inexact exceptions are raised if
1120 | the abstract input cannot be represented exactly as a subnormal quadruple-
1121 | precision floating-point number.
1122 | The input significand must be normalized or smaller. If the input
1123 | significand is not normalized, `zExp' must be 0; in that case, the result
1124 | returned is a subnormal number, and it must not require rounding. In the
1125 | usual case that the input significand is normalized, `zExp' must be 1 less
1126 | than the ``true'' floating-point exponent. The handling of underflow and
1127 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1128 *----------------------------------------------------------------------------*/
1130 static float128 roundAndPackFloat128(flag zSign, int32 zExp,
1131 uint64_t zSig0, uint64_t zSig1,
1132 uint64_t zSig2, float_status *status)
1135 flag roundNearestEven, increment, isTiny;
1137 roundingMode = status->float_rounding_mode;
1138 roundNearestEven = ( roundingMode == float_round_nearest_even );
1139 switch (roundingMode) {
1140 case float_round_nearest_even:
1141 case float_round_ties_away:
1142 increment = ((int64_t)zSig2 < 0);
1144 case float_round_to_zero:
1147 case float_round_up:
1148 increment = !zSign && zSig2;
1150 case float_round_down:
1151 increment = zSign && zSig2;
1156 if ( 0x7FFD <= (uint32_t) zExp ) {
1157 if ( ( 0x7FFD < zExp )
1158 || ( ( zExp == 0x7FFD )
1160 LIT64( 0x0001FFFFFFFFFFFF ),
1161 LIT64( 0xFFFFFFFFFFFFFFFF ),
1168 float_raise(float_flag_overflow | float_flag_inexact, status);
1169 if ( ( roundingMode == float_round_to_zero )
1170 || ( zSign && ( roundingMode == float_round_up ) )
1171 || ( ! zSign && ( roundingMode == float_round_down ) )
1177 LIT64( 0x0000FFFFFFFFFFFF ),
1178 LIT64( 0xFFFFFFFFFFFFFFFF )
1181 return packFloat128( zSign, 0x7FFF, 0, 0 );
1184 if (status->flush_to_zero) {
1185 float_raise(float_flag_output_denormal, status);
1186 return packFloat128(zSign, 0, 0, 0);
1189 (status->float_detect_tininess
1190 == float_tininess_before_rounding)
1196 LIT64( 0x0001FFFFFFFFFFFF ),
1197 LIT64( 0xFFFFFFFFFFFFFFFF )
1199 shift128ExtraRightJamming(
1200 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1202 if (isTiny && zSig2) {
1203 float_raise(float_flag_underflow, status);
1205 switch (roundingMode) {
1206 case float_round_nearest_even:
1207 case float_round_ties_away:
1208 increment = ((int64_t)zSig2 < 0);
1210 case float_round_to_zero:
1213 case float_round_up:
1214 increment = !zSign && zSig2;
1216 case float_round_down:
1217 increment = zSign && zSig2;
1225 status->float_exception_flags |= float_flag_inexact;
1228 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
1229 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1232 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1234 return packFloat128( zSign, zExp, zSig0, zSig1 );
1238 /*----------------------------------------------------------------------------
1239 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1240 | and significand formed by the concatenation of `zSig0' and `zSig1', and
1241 | returns the proper quadruple-precision floating-point value corresponding
1242 | to the abstract input. This routine is just like `roundAndPackFloat128'
1243 | except that the input significand has fewer bits and does not have to be
1244 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1246 *----------------------------------------------------------------------------*/
1248 static float128 normalizeRoundAndPackFloat128(flag zSign, int32 zExp,
1249 uint64_t zSig0, uint64_t zSig1,
1250 float_status *status)
1260 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1261 if ( 0 <= shiftCount ) {
1263 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1266 shift128ExtraRightJamming(
1267 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1270 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
1274 /*----------------------------------------------------------------------------
1275 | Returns the result of converting the 32-bit two's complement integer `a'
1276 | to the single-precision floating-point format. The conversion is performed
1277 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1278 *----------------------------------------------------------------------------*/
1280 float32 int32_to_float32(int32_t a, float_status *status)
1284 if ( a == 0 ) return float32_zero;
1285 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
1287 return normalizeRoundAndPackFloat32(zSign, 0x9C, zSign ? -a : a, status);
1290 /*----------------------------------------------------------------------------
1291 | Returns the result of converting the 32-bit two's complement integer `a'
1292 | to the double-precision floating-point format. The conversion is performed
1293 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1294 *----------------------------------------------------------------------------*/
1296 float64 int32_to_float64(int32_t a, float_status *status)
1303 if ( a == 0 ) return float64_zero;
1305 absA = zSign ? - a : a;
1306 shiftCount = countLeadingZeros32( absA ) + 21;
1308 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1312 /*----------------------------------------------------------------------------
1313 | Returns the result of converting the 32-bit two's complement integer `a'
1314 | to the extended double-precision floating-point format. The conversion
1315 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1317 *----------------------------------------------------------------------------*/
1319 floatx80 int32_to_floatx80(int32_t a, float_status *status)
1326 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1328 absA = zSign ? - a : a;
1329 shiftCount = countLeadingZeros32( absA ) + 32;
1331 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1335 /*----------------------------------------------------------------------------
1336 | Returns the result of converting the 32-bit two's complement integer `a' to
1337 | the quadruple-precision floating-point format. The conversion is performed
1338 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1339 *----------------------------------------------------------------------------*/
1341 float128 int32_to_float128(int32_t a, float_status *status)
1348 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1350 absA = zSign ? - a : a;
1351 shiftCount = countLeadingZeros32( absA ) + 17;
1353 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1357 /*----------------------------------------------------------------------------
1358 | Returns the result of converting the 64-bit two's complement integer `a'
1359 | to the single-precision floating-point format. The conversion is performed
1360 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1361 *----------------------------------------------------------------------------*/
1363 float32 int64_to_float32(int64_t a, float_status *status)
1369 if ( a == 0 ) return float32_zero;
1371 absA = zSign ? - a : a;
1372 shiftCount = countLeadingZeros64( absA ) - 40;
1373 if ( 0 <= shiftCount ) {
1374 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1378 if ( shiftCount < 0 ) {
1379 shift64RightJamming( absA, - shiftCount, &absA );
1382 absA <<= shiftCount;
1384 return roundAndPackFloat32(zSign, 0x9C - shiftCount, absA, status);
1389 /*----------------------------------------------------------------------------
1390 | Returns the result of converting the 64-bit two's complement integer `a'
1391 | to the double-precision floating-point format. The conversion is performed
1392 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1393 *----------------------------------------------------------------------------*/
1395 float64 int64_to_float64(int64_t a, float_status *status)
1399 if ( a == 0 ) return float64_zero;
1400 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
1401 return packFloat64( 1, 0x43E, 0 );
1404 return normalizeRoundAndPackFloat64(zSign, 0x43C, zSign ? -a : a, status);
1407 /*----------------------------------------------------------------------------
1408 | Returns the result of converting the 64-bit two's complement integer `a'
1409 | to the extended double-precision floating-point format. The conversion
1410 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1412 *----------------------------------------------------------------------------*/
1414 floatx80 int64_to_floatx80(int64_t a, float_status *status)
1420 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1422 absA = zSign ? - a : a;
1423 shiftCount = countLeadingZeros64( absA );
1424 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1428 /*----------------------------------------------------------------------------
1429 | Returns the result of converting the 64-bit two's complement integer `a' to
1430 | the quadruple-precision floating-point format. The conversion is performed
1431 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1432 *----------------------------------------------------------------------------*/
1434 float128 int64_to_float128(int64_t a, float_status *status)
1440 uint64_t zSig0, zSig1;
1442 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1444 absA = zSign ? - a : a;
1445 shiftCount = countLeadingZeros64( absA ) + 49;
1446 zExp = 0x406E - shiftCount;
1447 if ( 64 <= shiftCount ) {
1456 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1457 return packFloat128( zSign, zExp, zSig0, zSig1 );
1461 /*----------------------------------------------------------------------------
1462 | Returns the result of converting the 64-bit unsigned integer `a'
1463 | to the single-precision floating-point format. The conversion is performed
1464 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1465 *----------------------------------------------------------------------------*/
1467 float32 uint64_to_float32(uint64_t a, float_status *status)
1472 return float32_zero;
1475 /* Determine (left) shift needed to put first set bit into bit posn 23
1476 * (since packFloat32() expects the binary point between bits 23 and 22);
1477 * this is the fast case for smallish numbers.
1479 shiftcount = countLeadingZeros64(a) - 40;
1480 if (shiftcount >= 0) {
1481 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1483 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1484 * expects the binary point between bits 30 and 29, hence the + 7.
1487 if (shiftcount < 0) {
1488 shift64RightJamming(a, -shiftcount, &a);
1493 return roundAndPackFloat32(0, 0x9c - shiftcount, a, status);
1496 /*----------------------------------------------------------------------------
1497 | Returns the result of converting the 64-bit unsigned integer `a'
1498 | to the double-precision floating-point format. The conversion is performed
1499 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1500 *----------------------------------------------------------------------------*/
1502 float64 uint64_to_float64(uint64_t a, float_status *status)
1508 return float64_zero;
1511 shiftcount = countLeadingZeros64(a) - 1;
1512 if (shiftcount < 0) {
1513 shift64RightJamming(a, -shiftcount, &a);
1517 return roundAndPackFloat64(0, exp - shiftcount, a, status);
1520 /*----------------------------------------------------------------------------
1521 | Returns the result of converting the 64-bit unsigned integer `a'
1522 | to the quadruple-precision floating-point format. The conversion is performed
1523 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1524 *----------------------------------------------------------------------------*/
1526 float128 uint64_to_float128(uint64_t a, float_status *status)
1529 return float128_zero;
1531 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0, status);
1534 /*----------------------------------------------------------------------------
1535 | Returns the result of converting the single-precision floating-point value
1536 | `a' to the 32-bit two's complement integer format. The conversion is
1537 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1538 | Arithmetic---which means in particular that the conversion is rounded
1539 | according to the current rounding mode. If `a' is a NaN, the largest
1540 | positive integer is returned. Otherwise, if the conversion overflows, the
1541 | largest integer with the same sign as `a' is returned.
1542 *----------------------------------------------------------------------------*/
1544 int32 float32_to_int32(float32 a, float_status *status)
1547 int_fast16_t aExp, shiftCount;
1551 a = float32_squash_input_denormal(a, status);
1552 aSig = extractFloat32Frac( a );
1553 aExp = extractFloat32Exp( a );
1554 aSign = extractFloat32Sign( a );
1555 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1556 if ( aExp ) aSig |= 0x00800000;
1557 shiftCount = 0xAF - aExp;
1560 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1561 return roundAndPackInt32(aSign, aSig64, status);
1565 /*----------------------------------------------------------------------------
1566 | Returns the result of converting the single-precision floating-point value
1567 | `a' to the 32-bit two's complement integer format. The conversion is
1568 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1569 | Arithmetic, except that the conversion is always rounded toward zero.
1570 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1571 | the conversion overflows, the largest integer with the same sign as `a' is
1573 *----------------------------------------------------------------------------*/
1575 int32 float32_to_int32_round_to_zero(float32 a, float_status *status)
1578 int_fast16_t aExp, shiftCount;
1581 a = float32_squash_input_denormal(a, status);
1583 aSig = extractFloat32Frac( a );
1584 aExp = extractFloat32Exp( a );
1585 aSign = extractFloat32Sign( a );
1586 shiftCount = aExp - 0x9E;
1587 if ( 0 <= shiftCount ) {
1588 if ( float32_val(a) != 0xCF000000 ) {
1589 float_raise(float_flag_invalid, status);
1590 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1592 return (int32_t) 0x80000000;
1594 else if ( aExp <= 0x7E ) {
1596 status->float_exception_flags |= float_flag_inexact;
1600 aSig = ( aSig | 0x00800000 )<<8;
1601 z = aSig>>( - shiftCount );
1602 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1603 status->float_exception_flags |= float_flag_inexact;
1605 if ( aSign ) z = - z;
1610 /*----------------------------------------------------------------------------
1611 | Returns the result of converting the single-precision floating-point value
1612 | `a' to the 16-bit two's complement integer format. The conversion is
1613 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1614 | Arithmetic, except that the conversion is always rounded toward zero.
1615 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1616 | the conversion overflows, the largest integer with the same sign as `a' is
1618 *----------------------------------------------------------------------------*/
1620 int_fast16_t float32_to_int16_round_to_zero(float32 a, float_status *status)
1623 int_fast16_t aExp, shiftCount;
1627 aSig = extractFloat32Frac( a );
1628 aExp = extractFloat32Exp( a );
1629 aSign = extractFloat32Sign( a );
1630 shiftCount = aExp - 0x8E;
1631 if ( 0 <= shiftCount ) {
1632 if ( float32_val(a) != 0xC7000000 ) {
1633 float_raise(float_flag_invalid, status);
1634 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1638 return (int32_t) 0xffff8000;
1640 else if ( aExp <= 0x7E ) {
1641 if ( aExp | aSig ) {
1642 status->float_exception_flags |= float_flag_inexact;
1647 aSig = ( aSig | 0x00800000 )<<8;
1648 z = aSig>>( - shiftCount );
1649 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
1650 status->float_exception_flags |= float_flag_inexact;
1659 /*----------------------------------------------------------------------------
1660 | Returns the result of converting the single-precision floating-point value
1661 | `a' to the 64-bit two's complement integer format. The conversion is
1662 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1663 | Arithmetic---which means in particular that the conversion is rounded
1664 | according to the current rounding mode. If `a' is a NaN, the largest
1665 | positive integer is returned. Otherwise, if the conversion overflows, the
1666 | largest integer with the same sign as `a' is returned.
1667 *----------------------------------------------------------------------------*/
1669 int64 float32_to_int64(float32 a, float_status *status)
1672 int_fast16_t aExp, shiftCount;
1674 uint64_t aSig64, aSigExtra;
1675 a = float32_squash_input_denormal(a, status);
1677 aSig = extractFloat32Frac( a );
1678 aExp = extractFloat32Exp( a );
1679 aSign = extractFloat32Sign( a );
1680 shiftCount = 0xBE - aExp;
1681 if ( shiftCount < 0 ) {
1682 float_raise(float_flag_invalid, status);
1683 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1684 return LIT64( 0x7FFFFFFFFFFFFFFF );
1686 return (int64_t) LIT64( 0x8000000000000000 );
1688 if ( aExp ) aSig |= 0x00800000;
1691 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1692 return roundAndPackInt64(aSign, aSig64, aSigExtra, status);
1696 /*----------------------------------------------------------------------------
1697 | Returns the result of converting the single-precision floating-point value
1698 | `a' to the 64-bit unsigned integer format. The conversion is
1699 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1700 | Arithmetic---which means in particular that the conversion is rounded
1701 | according to the current rounding mode. If `a' is a NaN, the largest
1702 | unsigned integer is returned. Otherwise, if the conversion overflows, the
1703 | largest unsigned integer is returned. If the 'a' is negative, the result
1704 | is rounded and zero is returned; values that do not round to zero will
1705 | raise the inexact exception flag.
1706 *----------------------------------------------------------------------------*/
1708 uint64 float32_to_uint64(float32 a, float_status *status)
1711 int_fast16_t aExp, shiftCount;
1713 uint64_t aSig64, aSigExtra;
1714 a = float32_squash_input_denormal(a, status);
1716 aSig = extractFloat32Frac(a);
1717 aExp = extractFloat32Exp(a);
1718 aSign = extractFloat32Sign(a);
1719 if ((aSign) && (aExp > 126)) {
1720 float_raise(float_flag_invalid, status);
1721 if (float32_is_any_nan(a)) {
1722 return LIT64(0xFFFFFFFFFFFFFFFF);
1727 shiftCount = 0xBE - aExp;
1731 if (shiftCount < 0) {
1732 float_raise(float_flag_invalid, status);
1733 return LIT64(0xFFFFFFFFFFFFFFFF);
1738 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1739 return roundAndPackUint64(aSign, aSig64, aSigExtra, status);
1742 /*----------------------------------------------------------------------------
1743 | Returns the result of converting the single-precision floating-point value
1744 | `a' to the 64-bit unsigned integer format. The conversion is
1745 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1746 | Arithmetic, except that the conversion is always rounded toward zero. If
1747 | `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1748 | conversion overflows, the largest unsigned integer is returned. If the
1749 | 'a' is negative, the result is rounded and zero is returned; values that do
1750 | not round to zero will raise the inexact flag.
1751 *----------------------------------------------------------------------------*/
1753 uint64 float32_to_uint64_round_to_zero(float32 a, float_status *status)
1755 signed char current_rounding_mode = status->float_rounding_mode;
1756 set_float_rounding_mode(float_round_to_zero, status);
1757 int64_t v = float32_to_uint64(a, status);
1758 set_float_rounding_mode(current_rounding_mode, status);
1762 /*----------------------------------------------------------------------------
1763 | Returns the result of converting the single-precision floating-point value
1764 | `a' to the 64-bit two's complement integer format. The conversion is
1765 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1766 | Arithmetic, except that the conversion is always rounded toward zero. If
1767 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1768 | conversion overflows, the largest integer with the same sign as `a' is
1770 *----------------------------------------------------------------------------*/
1772 int64 float32_to_int64_round_to_zero(float32 a, float_status *status)
1775 int_fast16_t aExp, shiftCount;
1779 a = float32_squash_input_denormal(a, status);
1781 aSig = extractFloat32Frac( a );
1782 aExp = extractFloat32Exp( a );
1783 aSign = extractFloat32Sign( a );
1784 shiftCount = aExp - 0xBE;
1785 if ( 0 <= shiftCount ) {
1786 if ( float32_val(a) != 0xDF000000 ) {
1787 float_raise(float_flag_invalid, status);
1788 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1789 return LIT64( 0x7FFFFFFFFFFFFFFF );
1792 return (int64_t) LIT64( 0x8000000000000000 );
1794 else if ( aExp <= 0x7E ) {
1796 status->float_exception_flags |= float_flag_inexact;
1800 aSig64 = aSig | 0x00800000;
1802 z = aSig64>>( - shiftCount );
1803 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
1804 status->float_exception_flags |= float_flag_inexact;
1806 if ( aSign ) z = - z;
1811 /*----------------------------------------------------------------------------
1812 | Returns the result of converting the single-precision floating-point value
1813 | `a' to the double-precision floating-point format. The conversion is
1814 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1816 *----------------------------------------------------------------------------*/
1818 float64 float32_to_float64(float32 a, float_status *status)
1823 a = float32_squash_input_denormal(a, status);
1825 aSig = extractFloat32Frac( a );
1826 aExp = extractFloat32Exp( a );
1827 aSign = extractFloat32Sign( a );
1828 if ( aExp == 0xFF ) {
1830 return commonNaNToFloat64(float32ToCommonNaN(a, status), status);
1832 return packFloat64( aSign, 0x7FF, 0 );
1835 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1836 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1839 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
1843 /*----------------------------------------------------------------------------
1844 | Returns the result of converting the single-precision floating-point value
1845 | `a' to the extended double-precision floating-point format. The conversion
1846 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
1848 *----------------------------------------------------------------------------*/
1850 floatx80 float32_to_floatx80(float32 a, float_status *status)
1856 a = float32_squash_input_denormal(a, status);
1857 aSig = extractFloat32Frac( a );
1858 aExp = extractFloat32Exp( a );
1859 aSign = extractFloat32Sign( a );
1860 if ( aExp == 0xFF ) {
1862 return commonNaNToFloatx80(float32ToCommonNaN(a, status), status);
1864 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1867 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1868 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1871 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
1875 /*----------------------------------------------------------------------------
1876 | Returns the result of converting the single-precision floating-point value
1877 | `a' to the double-precision floating-point format. The conversion is
1878 | performed according to the IEC/IEEE Standard for Binary Floating-Point
1880 *----------------------------------------------------------------------------*/
1882 float128 float32_to_float128(float32 a, float_status *status)
1888 a = float32_squash_input_denormal(a, status);
1889 aSig = extractFloat32Frac( a );
1890 aExp = extractFloat32Exp( a );
1891 aSign = extractFloat32Sign( a );
1892 if ( aExp == 0xFF ) {
1894 return commonNaNToFloat128(float32ToCommonNaN(a, status), status);
1896 return packFloat128( aSign, 0x7FFF, 0, 0 );
1899 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1900 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1903 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
1907 /*----------------------------------------------------------------------------
1908 | Rounds the single-precision floating-point value `a' to an integer, and
1909 | returns the result as a single-precision floating-point value. The
1910 | operation is performed according to the IEC/IEEE Standard for Binary
1911 | Floating-Point Arithmetic.
1912 *----------------------------------------------------------------------------*/
1914 float32 float32_round_to_int(float32 a, float_status *status)
1918 uint32_t lastBitMask, roundBitsMask;
1920 a = float32_squash_input_denormal(a, status);
1922 aExp = extractFloat32Exp( a );
1923 if ( 0x96 <= aExp ) {
1924 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1925 return propagateFloat32NaN(a, a, status);
1929 if ( aExp <= 0x7E ) {
1930 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
1931 status->float_exception_flags |= float_flag_inexact;
1932 aSign = extractFloat32Sign( a );
1933 switch (status->float_rounding_mode) {
1934 case float_round_nearest_even:
1935 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1936 return packFloat32( aSign, 0x7F, 0 );
1939 case float_round_ties_away:
1941 return packFloat32(aSign, 0x7F, 0);
1944 case float_round_down:
1945 return make_float32(aSign ? 0xBF800000 : 0);
1946 case float_round_up:
1947 return make_float32(aSign ? 0x80000000 : 0x3F800000);
1949 return packFloat32( aSign, 0, 0 );
1952 lastBitMask <<= 0x96 - aExp;
1953 roundBitsMask = lastBitMask - 1;
1955 switch (status->float_rounding_mode) {
1956 case float_round_nearest_even:
1957 z += lastBitMask>>1;
1958 if ((z & roundBitsMask) == 0) {
1962 case float_round_ties_away:
1963 z += lastBitMask >> 1;
1965 case float_round_to_zero:
1967 case float_round_up:
1968 if (!extractFloat32Sign(make_float32(z))) {
1972 case float_round_down:
1973 if (extractFloat32Sign(make_float32(z))) {
1980 z &= ~ roundBitsMask;
1981 if (z != float32_val(a)) {
1982 status->float_exception_flags |= float_flag_inexact;
1984 return make_float32(z);
1988 /*----------------------------------------------------------------------------
1989 | Returns the result of adding the absolute values of the single-precision
1990 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1991 | before being returned. `zSign' is ignored if the result is a NaN.
1992 | The addition is performed according to the IEC/IEEE Standard for Binary
1993 | Floating-Point Arithmetic.
1994 *----------------------------------------------------------------------------*/
1996 static float32 addFloat32Sigs(float32 a, float32 b, flag zSign,
1997 float_status *status)
1999 int_fast16_t aExp, bExp, zExp;
2000 uint32_t aSig, bSig, zSig;
2001 int_fast16_t expDiff;
2003 aSig = extractFloat32Frac( a );
2004 aExp = extractFloat32Exp( a );
2005 bSig = extractFloat32Frac( b );
2006 bExp = extractFloat32Exp( b );
2007 expDiff = aExp - bExp;
2010 if ( 0 < expDiff ) {
2011 if ( aExp == 0xFF ) {
2013 return propagateFloat32NaN(a, b, status);
2023 shift32RightJamming( bSig, expDiff, &bSig );
2026 else if ( expDiff < 0 ) {
2027 if ( bExp == 0xFF ) {
2029 return propagateFloat32NaN(a, b, status);
2031 return packFloat32( zSign, 0xFF, 0 );
2039 shift32RightJamming( aSig, - expDiff, &aSig );
2043 if ( aExp == 0xFF ) {
2045 return propagateFloat32NaN(a, b, status);
2050 if (status->flush_to_zero) {
2052 float_raise(float_flag_output_denormal, status);
2054 return packFloat32(zSign, 0, 0);
2056 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2058 zSig = 0x40000000 + aSig + bSig;
2063 zSig = ( aSig + bSig )<<1;
2065 if ( (int32_t) zSig < 0 ) {
2070 return roundAndPackFloat32(zSign, zExp, zSig, status);
2074 /*----------------------------------------------------------------------------
2075 | Returns the result of subtracting the absolute values of the single-
2076 | precision floating-point values `a' and `b'. If `zSign' is 1, the
2077 | difference is negated before being returned. `zSign' is ignored if the
2078 | result is a NaN. The subtraction is performed according to the IEC/IEEE
2079 | Standard for Binary Floating-Point Arithmetic.
2080 *----------------------------------------------------------------------------*/
2082 static float32 subFloat32Sigs(float32 a, float32 b, flag zSign,
2083 float_status *status)
2085 int_fast16_t aExp, bExp, zExp;
2086 uint32_t aSig, bSig, zSig;
2087 int_fast16_t expDiff;
2089 aSig = extractFloat32Frac( a );
2090 aExp = extractFloat32Exp( a );
2091 bSig = extractFloat32Frac( b );
2092 bExp = extractFloat32Exp( b );
2093 expDiff = aExp - bExp;
2096 if ( 0 < expDiff ) goto aExpBigger;
2097 if ( expDiff < 0 ) goto bExpBigger;
2098 if ( aExp == 0xFF ) {
2100 return propagateFloat32NaN(a, b, status);
2102 float_raise(float_flag_invalid, status);
2103 return float32_default_nan;
2109 if ( bSig < aSig ) goto aBigger;
2110 if ( aSig < bSig ) goto bBigger;
2111 return packFloat32(status->float_rounding_mode == float_round_down, 0, 0);
2113 if ( bExp == 0xFF ) {
2115 return propagateFloat32NaN(a, b, status);
2117 return packFloat32( zSign ^ 1, 0xFF, 0 );
2125 shift32RightJamming( aSig, - expDiff, &aSig );
2131 goto normalizeRoundAndPack;
2133 if ( aExp == 0xFF ) {
2135 return propagateFloat32NaN(a, b, status);
2145 shift32RightJamming( bSig, expDiff, &bSig );
2150 normalizeRoundAndPack:
2152 return normalizeRoundAndPackFloat32(zSign, zExp, zSig, status);
2156 /*----------------------------------------------------------------------------
2157 | Returns the result of adding the single-precision floating-point values `a'
2158 | and `b'. The operation is performed according to the IEC/IEEE Standard for
2159 | Binary Floating-Point Arithmetic.
2160 *----------------------------------------------------------------------------*/
2162 float32 float32_add(float32 a, float32 b, float_status *status)
2165 a = float32_squash_input_denormal(a, status);
2166 b = float32_squash_input_denormal(b, status);
2168 aSign = extractFloat32Sign( a );
2169 bSign = extractFloat32Sign( b );
2170 if ( aSign == bSign ) {
2171 return addFloat32Sigs(a, b, aSign, status);
2174 return subFloat32Sigs(a, b, aSign, status);
2179 /*----------------------------------------------------------------------------
2180 | Returns the result of subtracting the single-precision floating-point values
2181 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2182 | for Binary Floating-Point Arithmetic.
2183 *----------------------------------------------------------------------------*/
2185 float32 float32_sub(float32 a, float32 b, float_status *status)
2188 a = float32_squash_input_denormal(a, status);
2189 b = float32_squash_input_denormal(b, status);
2191 aSign = extractFloat32Sign( a );
2192 bSign = extractFloat32Sign( b );
2193 if ( aSign == bSign ) {
2194 return subFloat32Sigs(a, b, aSign, status);
2197 return addFloat32Sigs(a, b, aSign, status);
2202 /*----------------------------------------------------------------------------
2203 | Returns the result of multiplying the single-precision floating-point values
2204 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2205 | for Binary Floating-Point Arithmetic.
2206 *----------------------------------------------------------------------------*/
2208 float32 float32_mul(float32 a, float32 b, float_status *status)
2210 flag aSign, bSign, zSign;
2211 int_fast16_t aExp, bExp, zExp;
2212 uint32_t aSig, bSig;
2216 a = float32_squash_input_denormal(a, status);
2217 b = float32_squash_input_denormal(b, status);
2219 aSig = extractFloat32Frac( a );
2220 aExp = extractFloat32Exp( a );
2221 aSign = extractFloat32Sign( a );
2222 bSig = extractFloat32Frac( b );
2223 bExp = extractFloat32Exp( b );
2224 bSign = extractFloat32Sign( b );
2225 zSign = aSign ^ bSign;
2226 if ( aExp == 0xFF ) {
2227 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2228 return propagateFloat32NaN(a, b, status);
2230 if ( ( bExp | bSig ) == 0 ) {
2231 float_raise(float_flag_invalid, status);
2232 return float32_default_nan;
2234 return packFloat32( zSign, 0xFF, 0 );
2236 if ( bExp == 0xFF ) {
2238 return propagateFloat32NaN(a, b, status);
2240 if ( ( aExp | aSig ) == 0 ) {
2241 float_raise(float_flag_invalid, status);
2242 return float32_default_nan;
2244 return packFloat32( zSign, 0xFF, 0 );
2247 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2248 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2251 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2252 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2254 zExp = aExp + bExp - 0x7F;
2255 aSig = ( aSig | 0x00800000 )<<7;
2256 bSig = ( bSig | 0x00800000 )<<8;
2257 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
2259 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
2263 return roundAndPackFloat32(zSign, zExp, zSig, status);
2267 /*----------------------------------------------------------------------------
2268 | Returns the result of dividing the single-precision floating-point value `a'
2269 | by the corresponding value `b'. The operation is performed according to the
2270 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2271 *----------------------------------------------------------------------------*/
2273 float32 float32_div(float32 a, float32 b, float_status *status)
2275 flag aSign, bSign, zSign;
2276 int_fast16_t aExp, bExp, zExp;
2277 uint32_t aSig, bSig, zSig;
2278 a = float32_squash_input_denormal(a, status);
2279 b = float32_squash_input_denormal(b, status);
2281 aSig = extractFloat32Frac( a );
2282 aExp = extractFloat32Exp( a );
2283 aSign = extractFloat32Sign( a );
2284 bSig = extractFloat32Frac( b );
2285 bExp = extractFloat32Exp( b );
2286 bSign = extractFloat32Sign( b );
2287 zSign = aSign ^ bSign;
2288 if ( aExp == 0xFF ) {
2290 return propagateFloat32NaN(a, b, status);
2292 if ( bExp == 0xFF ) {
2294 return propagateFloat32NaN(a, b, status);
2296 float_raise(float_flag_invalid, status);
2297 return float32_default_nan;
2299 return packFloat32( zSign, 0xFF, 0 );
2301 if ( bExp == 0xFF ) {
2303 return propagateFloat32NaN(a, b, status);
2305 return packFloat32( zSign, 0, 0 );
2309 if ( ( aExp | aSig ) == 0 ) {
2310 float_raise(float_flag_invalid, status);
2311 return float32_default_nan;
2313 float_raise(float_flag_divbyzero, status);
2314 return packFloat32( zSign, 0xFF, 0 );
2316 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2319 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2320 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2322 zExp = aExp - bExp + 0x7D;
2323 aSig = ( aSig | 0x00800000 )<<7;
2324 bSig = ( bSig | 0x00800000 )<<8;
2325 if ( bSig <= ( aSig + aSig ) ) {
2329 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
2330 if ( ( zSig & 0x3F ) == 0 ) {
2331 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
2333 return roundAndPackFloat32(zSign, zExp, zSig, status);
2337 /*----------------------------------------------------------------------------
2338 | Returns the remainder of the single-precision floating-point value `a'
2339 | with respect to the corresponding value `b'. The operation is performed
2340 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2341 *----------------------------------------------------------------------------*/
2343 float32 float32_rem(float32 a, float32 b, float_status *status)
2346 int_fast16_t aExp, bExp, expDiff;
2347 uint32_t aSig, bSig;
2349 uint64_t aSig64, bSig64, q64;
2350 uint32_t alternateASig;
2352 a = float32_squash_input_denormal(a, status);
2353 b = float32_squash_input_denormal(b, status);
2355 aSig = extractFloat32Frac( a );
2356 aExp = extractFloat32Exp( a );
2357 aSign = extractFloat32Sign( a );
2358 bSig = extractFloat32Frac( b );
2359 bExp = extractFloat32Exp( b );
2360 if ( aExp == 0xFF ) {
2361 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2362 return propagateFloat32NaN(a, b, status);
2364 float_raise(float_flag_invalid, status);
2365 return float32_default_nan;
2367 if ( bExp == 0xFF ) {
2369 return propagateFloat32NaN(a, b, status);
2375 float_raise(float_flag_invalid, status);
2376 return float32_default_nan;
2378 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2381 if ( aSig == 0 ) return a;
2382 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2384 expDiff = aExp - bExp;
2387 if ( expDiff < 32 ) {
2390 if ( expDiff < 0 ) {
2391 if ( expDiff < -1 ) return a;
2394 q = ( bSig <= aSig );
2395 if ( q ) aSig -= bSig;
2396 if ( 0 < expDiff ) {
2397 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
2400 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2408 if ( bSig <= aSig ) aSig -= bSig;
2409 aSig64 = ( (uint64_t) aSig )<<40;
2410 bSig64 = ( (uint64_t) bSig )<<40;
2412 while ( 0 < expDiff ) {
2413 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2414 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2415 aSig64 = - ( ( bSig * q64 )<<38 );
2419 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2420 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2421 q = q64>>( 64 - expDiff );
2423 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2426 alternateASig = aSig;
2429 } while ( 0 <= (int32_t) aSig );
2430 sigMean = aSig + alternateASig;
2431 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2432 aSig = alternateASig;
2434 zSign = ( (int32_t) aSig < 0 );
2435 if ( zSign ) aSig = - aSig;
2436 return normalizeRoundAndPackFloat32(aSign ^ zSign, bExp, aSig, status);
2439 /*----------------------------------------------------------------------------
2440 | Returns the result of multiplying the single-precision floating-point values
2441 | `a' and `b' then adding 'c', with no intermediate rounding step after the
2442 | multiplication. The operation is performed according to the IEC/IEEE
2443 | Standard for Binary Floating-Point Arithmetic 754-2008.
2444 | The flags argument allows the caller to select negation of the
2445 | addend, the intermediate product, or the final result. (The difference
2446 | between this and having the caller do a separate negation is that negating
2447 | externally will flip the sign bit on NaNs.)
2448 *----------------------------------------------------------------------------*/
2450 float32 float32_muladd(float32 a, float32 b, float32 c, int flags,
2451 float_status *status)
2453 flag aSign, bSign, cSign, zSign;
2454 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
2455 uint32_t aSig, bSig, cSig;
2456 flag pInf, pZero, pSign;
2457 uint64_t pSig64, cSig64, zSig64;
2460 flag signflip, infzero;
2462 a = float32_squash_input_denormal(a, status);
2463 b = float32_squash_input_denormal(b, status);
2464 c = float32_squash_input_denormal(c, status);
2465 aSig = extractFloat32Frac(a);
2466 aExp = extractFloat32Exp(a);
2467 aSign = extractFloat32Sign(a);
2468 bSig = extractFloat32Frac(b);
2469 bExp = extractFloat32Exp(b);
2470 bSign = extractFloat32Sign(b);
2471 cSig = extractFloat32Frac(c);
2472 cExp = extractFloat32Exp(c);
2473 cSign = extractFloat32Sign(c);
2475 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2476 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2478 /* It is implementation-defined whether the cases of (0,inf,qnan)
2479 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2480 * they return if they do), so we have to hand this information
2481 * off to the target-specific pick-a-NaN routine.
2483 if (((aExp == 0xff) && aSig) ||
2484 ((bExp == 0xff) && bSig) ||
2485 ((cExp == 0xff) && cSig)) {
2486 return propagateFloat32MulAddNaN(a, b, c, infzero, status);
2490 float_raise(float_flag_invalid, status);
2491 return float32_default_nan;
2494 if (flags & float_muladd_negate_c) {
2498 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2500 /* Work out the sign and type of the product */
2501 pSign = aSign ^ bSign;
2502 if (flags & float_muladd_negate_product) {
2505 pInf = (aExp == 0xff) || (bExp == 0xff);
2506 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2509 if (pInf && (pSign ^ cSign)) {
2510 /* addition of opposite-signed infinities => InvalidOperation */
2511 float_raise(float_flag_invalid, status);
2512 return float32_default_nan;
2514 /* Otherwise generate an infinity of the same sign */
2515 return packFloat32(cSign ^ signflip, 0xff, 0);
2519 return packFloat32(pSign ^ signflip, 0xff, 0);
2525 /* Adding two exact zeroes */
2526 if (pSign == cSign) {
2528 } else if (status->float_rounding_mode == float_round_down) {
2533 return packFloat32(zSign ^ signflip, 0, 0);
2535 /* Exact zero plus a denorm */
2536 if (status->flush_to_zero) {
2537 float_raise(float_flag_output_denormal, status);
2538 return packFloat32(cSign ^ signflip, 0, 0);
2541 /* Zero plus something non-zero : just return the something */
2542 if (flags & float_muladd_halve_result) {
2544 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2546 /* Subtract one to halve, and one again because roundAndPackFloat32
2547 * wants one less than the true exponent.
2550 cSig = (cSig | 0x00800000) << 7;
2551 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig, status);
2553 return packFloat32(cSign ^ signflip, cExp, cSig);
2557 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2560 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2563 /* Calculate the actual result a * b + c */
2565 /* Multiply first; this is easy. */
2566 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2567 * because we want the true exponent, not the "one-less-than"
2568 * flavour that roundAndPackFloat32() takes.
2570 pExp = aExp + bExp - 0x7e;
2571 aSig = (aSig | 0x00800000) << 7;
2572 bSig = (bSig | 0x00800000) << 8;
2573 pSig64 = (uint64_t)aSig * bSig;
2574 if ((int64_t)(pSig64 << 1) >= 0) {
2579 zSign = pSign ^ signflip;
2581 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2586 /* Throw out the special case of c being an exact zero now */
2587 shift64RightJamming(pSig64, 32, &pSig64);
2589 if (flags & float_muladd_halve_result) {
2592 return roundAndPackFloat32(zSign, pExp - 1,
2595 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2598 cSig64 = (uint64_t)cSig << (62 - 23);
2599 cSig64 |= LIT64(0x4000000000000000);
2600 expDiff = pExp - cExp;
2602 if (pSign == cSign) {
2605 /* scale c to match p */
2606 shift64RightJamming(cSig64, expDiff, &cSig64);
2608 } else if (expDiff < 0) {
2609 /* scale p to match c */
2610 shift64RightJamming(pSig64, -expDiff, &pSig64);
2613 /* no scaling needed */
2616 /* Add significands and make sure explicit bit ends up in posn 62 */
2617 zSig64 = pSig64 + cSig64;
2618 if ((int64_t)zSig64 < 0) {
2619 shift64RightJamming(zSig64, 1, &zSig64);
2626 shift64RightJamming(cSig64, expDiff, &cSig64);
2627 zSig64 = pSig64 - cSig64;
2629 } else if (expDiff < 0) {
2630 shift64RightJamming(pSig64, -expDiff, &pSig64);
2631 zSig64 = cSig64 - pSig64;
2636 if (cSig64 < pSig64) {
2637 zSig64 = pSig64 - cSig64;
2638 } else if (pSig64 < cSig64) {
2639 zSig64 = cSig64 - pSig64;
2644 if (status->float_rounding_mode == float_round_down) {
2647 return packFloat32(zSign, 0, 0);
2651 /* Normalize to put the explicit bit back into bit 62. */
2652 shiftcount = countLeadingZeros64(zSig64) - 1;
2653 zSig64 <<= shiftcount;
2656 if (flags & float_muladd_halve_result) {
2660 shift64RightJamming(zSig64, 32, &zSig64);
2661 return roundAndPackFloat32(zSign, zExp, zSig64, status);
2665 /*----------------------------------------------------------------------------
2666 | Returns the square root of the single-precision floating-point value `a'.
2667 | The operation is performed according to the IEC/IEEE Standard for Binary
2668 | Floating-Point Arithmetic.
2669 *----------------------------------------------------------------------------*/
2671 float32 float32_sqrt(float32 a, float_status *status)
2674 int_fast16_t aExp, zExp;
2675 uint32_t aSig, zSig;
2677 a = float32_squash_input_denormal(a, status);
2679 aSig = extractFloat32Frac( a );
2680 aExp = extractFloat32Exp( a );
2681 aSign = extractFloat32Sign( a );
2682 if ( aExp == 0xFF ) {
2684 return propagateFloat32NaN(a, float32_zero, status);
2686 if ( ! aSign ) return a;
2687 float_raise(float_flag_invalid, status);
2688 return float32_default_nan;
2691 if ( ( aExp | aSig ) == 0 ) return a;
2692 float_raise(float_flag_invalid, status);
2693 return float32_default_nan;
2696 if ( aSig == 0 ) return float32_zero;
2697 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2699 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2700 aSig = ( aSig | 0x00800000 )<<8;
2701 zSig = estimateSqrt32( aExp, aSig ) + 2;
2702 if ( ( zSig & 0x7F ) <= 5 ) {
2708 term = ( (uint64_t) zSig ) * zSig;
2709 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2710 while ( (int64_t) rem < 0 ) {
2712 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
2714 zSig |= ( rem != 0 );
2716 shift32RightJamming( zSig, 1, &zSig );
2718 return roundAndPackFloat32(0, zExp, zSig, status);
2722 /*----------------------------------------------------------------------------
2723 | Returns the binary exponential of the single-precision floating-point value
2724 | `a'. The operation is performed according to the IEC/IEEE Standard for
2725 | Binary Floating-Point Arithmetic.
2727 | Uses the following identities:
2729 | 1. -------------------------------------------------------------------------
2733 | 2. -------------------------------------------------------------------------
2736 | e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2738 *----------------------------------------------------------------------------*/
2740 static const float64 float32_exp2_coefficients[15] =
2742 const_float64( 0x3ff0000000000000ll ), /* 1 */
2743 const_float64( 0x3fe0000000000000ll ), /* 2 */
2744 const_float64( 0x3fc5555555555555ll ), /* 3 */
2745 const_float64( 0x3fa5555555555555ll ), /* 4 */
2746 const_float64( 0x3f81111111111111ll ), /* 5 */
2747 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2748 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2749 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2750 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2751 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2752 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2753 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2754 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2755 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2756 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
2759 float32 float32_exp2(float32 a, float_status *status)
2766 a = float32_squash_input_denormal(a, status);
2768 aSig = extractFloat32Frac( a );
2769 aExp = extractFloat32Exp( a );
2770 aSign = extractFloat32Sign( a );
2772 if ( aExp == 0xFF) {
2774 return propagateFloat32NaN(a, float32_zero, status);
2776 return (aSign) ? float32_zero : a;
2779 if (aSig == 0) return float32_one;
2782 float_raise(float_flag_inexact, status);
2784 /* ******************************* */
2785 /* using float64 for approximation */
2786 /* ******************************* */
2787 x = float32_to_float64(a, status);
2788 x = float64_mul(x, float64_ln2, status);
2792 for (i = 0 ; i < 15 ; i++) {
2795 f = float64_mul(xn, float32_exp2_coefficients[i], status);
2796 r = float64_add(r, f, status);
2798 xn = float64_mul(xn, x, status);
2801 return float64_to_float32(r, status);
2804 /*----------------------------------------------------------------------------
2805 | Returns the binary log of the single-precision floating-point value `a'.
2806 | The operation is performed according to the IEC/IEEE Standard for Binary
2807 | Floating-Point Arithmetic.
2808 *----------------------------------------------------------------------------*/
2809 float32 float32_log2(float32 a, float_status *status)
2813 uint32_t aSig, zSig, i;
2815 a = float32_squash_input_denormal(a, status);
2816 aSig = extractFloat32Frac( a );
2817 aExp = extractFloat32Exp( a );
2818 aSign = extractFloat32Sign( a );
2821 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2822 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2825 float_raise(float_flag_invalid, status);
2826 return float32_default_nan;
2828 if ( aExp == 0xFF ) {
2830 return propagateFloat32NaN(a, float32_zero, status);
2840 for (i = 1 << 22; i > 0; i >>= 1) {
2841 aSig = ( (uint64_t)aSig * aSig ) >> 23;
2842 if ( aSig & 0x01000000 ) {
2851 return normalizeRoundAndPackFloat32(zSign, 0x85, zSig, status);
2854 /*----------------------------------------------------------------------------
2855 | Returns 1 if the single-precision floating-point value `a' is equal to
2856 | the corresponding value `b', and 0 otherwise. The invalid exception is
2857 | raised if either operand is a NaN. Otherwise, the comparison is performed
2858 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2859 *----------------------------------------------------------------------------*/
2861 int float32_eq(float32 a, float32 b, float_status *status)
2864 a = float32_squash_input_denormal(a, status);
2865 b = float32_squash_input_denormal(b, status);
2867 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2868 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2870 float_raise(float_flag_invalid, status);
2873 av = float32_val(a);
2874 bv = float32_val(b);
2875 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2878 /*----------------------------------------------------------------------------
2879 | Returns 1 if the single-precision floating-point value `a' is less than
2880 | or equal to the corresponding value `b', and 0 otherwise. The invalid
2881 | exception is raised if either operand is a NaN. The comparison is performed
2882 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2883 *----------------------------------------------------------------------------*/
2885 int float32_le(float32 a, float32 b, float_status *status)
2889 a = float32_squash_input_denormal(a, status);
2890 b = float32_squash_input_denormal(b, status);
2892 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2893 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2895 float_raise(float_flag_invalid, status);
2898 aSign = extractFloat32Sign( a );
2899 bSign = extractFloat32Sign( b );
2900 av = float32_val(a);
2901 bv = float32_val(b);
2902 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
2903 return ( av == bv ) || ( aSign ^ ( av < bv ) );
2907 /*----------------------------------------------------------------------------
2908 | Returns 1 if the single-precision floating-point value `a' is less than
2909 | the corresponding value `b', and 0 otherwise. The invalid exception is
2910 | raised if either operand is a NaN. The comparison is performed according
2911 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2912 *----------------------------------------------------------------------------*/
2914 int float32_lt(float32 a, float32 b, float_status *status)
2918 a = float32_squash_input_denormal(a, status);
2919 b = float32_squash_input_denormal(b, status);
2921 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2922 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2924 float_raise(float_flag_invalid, status);
2927 aSign = extractFloat32Sign( a );
2928 bSign = extractFloat32Sign( b );
2929 av = float32_val(a);
2930 bv = float32_val(b);
2931 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
2932 return ( av != bv ) && ( aSign ^ ( av < bv ) );
2936 /*----------------------------------------------------------------------------
2937 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
2938 | be compared, and 0 otherwise. The invalid exception is raised if either
2939 | operand is a NaN. The comparison is performed according to the IEC/IEEE
2940 | Standard for Binary Floating-Point Arithmetic.
2941 *----------------------------------------------------------------------------*/
2943 int float32_unordered(float32 a, float32 b, float_status *status)
2945 a = float32_squash_input_denormal(a, status);
2946 b = float32_squash_input_denormal(b, status);
2948 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2949 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2951 float_raise(float_flag_invalid, status);
2957 /*----------------------------------------------------------------------------
2958 | Returns 1 if the single-precision floating-point value `a' is equal to
2959 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2960 | exception. The comparison is performed according to the IEC/IEEE Standard
2961 | for Binary Floating-Point Arithmetic.
2962 *----------------------------------------------------------------------------*/
2964 int float32_eq_quiet(float32 a, float32 b, float_status *status)
2966 a = float32_squash_input_denormal(a, status);
2967 b = float32_squash_input_denormal(b, status);
2969 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2970 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2972 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2973 float_raise(float_flag_invalid, status);
2977 return ( float32_val(a) == float32_val(b) ) ||
2978 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
2981 /*----------------------------------------------------------------------------
2982 | Returns 1 if the single-precision floating-point value `a' is less than or
2983 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2984 | cause an exception. Otherwise, the comparison is performed according to the
2985 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2986 *----------------------------------------------------------------------------*/
2988 int float32_le_quiet(float32 a, float32 b, float_status *status)
2992 a = float32_squash_input_denormal(a, status);
2993 b = float32_squash_input_denormal(b, status);
2995 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2996 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2998 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2999 float_raise(float_flag_invalid, status);
3003 aSign = extractFloat32Sign( a );
3004 bSign = extractFloat32Sign( b );
3005 av = float32_val(a);
3006 bv = float32_val(b);
3007 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
3008 return ( av == bv ) || ( aSign ^ ( av < bv ) );
3012 /*----------------------------------------------------------------------------
3013 | Returns 1 if the single-precision floating-point value `a' is less than
3014 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
3015 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
3016 | Standard for Binary Floating-Point Arithmetic.
3017 *----------------------------------------------------------------------------*/
3019 int float32_lt_quiet(float32 a, float32 b, float_status *status)
3023 a = float32_squash_input_denormal(a, status);
3024 b = float32_squash_input_denormal(b, status);
3026 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3027 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3029 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3030 float_raise(float_flag_invalid, status);
3034 aSign = extractFloat32Sign( a );
3035 bSign = extractFloat32Sign( b );
3036 av = float32_val(a);
3037 bv = float32_val(b);
3038 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
3039 return ( av != bv ) && ( aSign ^ ( av < bv ) );
3043 /*----------------------------------------------------------------------------
3044 | Returns 1 if the single-precision floating-point values `a' and `b' cannot
3045 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
3046 | comparison is performed according to the IEC/IEEE Standard for Binary
3047 | Floating-Point Arithmetic.
3048 *----------------------------------------------------------------------------*/
3050 int float32_unordered_quiet(float32 a, float32 b, float_status *status)
3052 a = float32_squash_input_denormal(a, status);
3053 b = float32_squash_input_denormal(b, status);
3055 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
3056 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
3058 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
3059 float_raise(float_flag_invalid, status);
3066 /*----------------------------------------------------------------------------
3067 | Returns the result of converting the double-precision floating-point value
3068 | `a' to the 32-bit two's complement integer format. The conversion is
3069 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3070 | Arithmetic---which means in particular that the conversion is rounded
3071 | according to the current rounding mode. If `a' is a NaN, the largest
3072 | positive integer is returned. Otherwise, if the conversion overflows, the
3073 | largest integer with the same sign as `a' is returned.
3074 *----------------------------------------------------------------------------*/
3076 int32 float64_to_int32(float64 a, float_status *status)
3079 int_fast16_t aExp, shiftCount;
3081 a = float64_squash_input_denormal(a, status);
3083 aSig = extractFloat64Frac( a );
3084 aExp = extractFloat64Exp( a );
3085 aSign = extractFloat64Sign( a );
3086 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3087 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3088 shiftCount = 0x42C - aExp;
3089 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3090 return roundAndPackInt32(aSign, aSig, status);
3094 /*----------------------------------------------------------------------------
3095 | Returns the result of converting the double-precision floating-point value
3096 | `a' to the 32-bit two's complement integer format. The conversion is
3097 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3098 | Arithmetic, except that the conversion is always rounded toward zero.
3099 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3100 | the conversion overflows, the largest integer with the same sign as `a' is
3102 *----------------------------------------------------------------------------*/
3104 int32 float64_to_int32_round_to_zero(float64 a, float_status *status)
3107 int_fast16_t aExp, shiftCount;
3108 uint64_t aSig, savedASig;
3110 a = float64_squash_input_denormal(a, status);
3112 aSig = extractFloat64Frac( a );
3113 aExp = extractFloat64Exp( a );
3114 aSign = extractFloat64Sign( a );
3115 if ( 0x41E < aExp ) {
3116 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3119 else if ( aExp < 0x3FF ) {
3121 status->float_exception_flags |= float_flag_inexact;
3125 aSig |= LIT64( 0x0010000000000000 );
3126 shiftCount = 0x433 - aExp;
3128 aSig >>= shiftCount;
3130 if ( aSign ) z = - z;
3131 if ( ( z < 0 ) ^ aSign ) {
3133 float_raise(float_flag_invalid, status);
3134 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
3136 if ( ( aSig<<shiftCount ) != savedASig ) {
3137 status->float_exception_flags |= float_flag_inexact;
3143 /*----------------------------------------------------------------------------
3144 | Returns the result of converting the double-precision floating-point value
3145 | `a' to the 16-bit two's complement integer format. The conversion is
3146 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3147 | Arithmetic, except that the conversion is always rounded toward zero.
3148 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3149 | the conversion overflows, the largest integer with the same sign as `a' is
3151 *----------------------------------------------------------------------------*/
3153 int_fast16_t float64_to_int16_round_to_zero(float64 a, float_status *status)
3156 int_fast16_t aExp, shiftCount;
3157 uint64_t aSig, savedASig;
3160 aSig = extractFloat64Frac( a );
3161 aExp = extractFloat64Exp( a );
3162 aSign = extractFloat64Sign( a );
3163 if ( 0x40E < aExp ) {
3164 if ( ( aExp == 0x7FF ) && aSig ) {
3169 else if ( aExp < 0x3FF ) {
3170 if ( aExp || aSig ) {
3171 status->float_exception_flags |= float_flag_inexact;
3175 aSig |= LIT64( 0x0010000000000000 );
3176 shiftCount = 0x433 - aExp;
3178 aSig >>= shiftCount;
3183 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3185 float_raise(float_flag_invalid, status);
3186 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
3188 if ( ( aSig<<shiftCount ) != savedASig ) {
3189 status->float_exception_flags |= float_flag_inexact;
3194 /*----------------------------------------------------------------------------
3195 | Returns the result of converting the double-precision floating-point value
3196 | `a' to the 64-bit two's complement integer format. The conversion is
3197 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3198 | Arithmetic---which means in particular that the conversion is rounded
3199 | according to the current rounding mode. If `a' is a NaN, the largest
3200 | positive integer is returned. Otherwise, if the conversion overflows, the
3201 | largest integer with the same sign as `a' is returned.
3202 *----------------------------------------------------------------------------*/
3204 int64 float64_to_int64(float64 a, float_status *status)
3207 int_fast16_t aExp, shiftCount;
3208 uint64_t aSig, aSigExtra;
3209 a = float64_squash_input_denormal(a, status);
3211 aSig = extractFloat64Frac( a );
3212 aExp = extractFloat64Exp( a );
3213 aSign = extractFloat64Sign( a );
3214 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3215 shiftCount = 0x433 - aExp;
3216 if ( shiftCount <= 0 ) {
3217 if ( 0x43E < aExp ) {
3218 float_raise(float_flag_invalid, status);
3220 || ( ( aExp == 0x7FF )
3221 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3223 return LIT64( 0x7FFFFFFFFFFFFFFF );
3225 return (int64_t) LIT64( 0x8000000000000000 );
3228 aSig <<= - shiftCount;
3231 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3233 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
3237 /*----------------------------------------------------------------------------
3238 | Returns the result of converting the double-precision floating-point value
3239 | `a' to the 64-bit two's complement integer format. The conversion is
3240 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3241 | Arithmetic, except that the conversion is always rounded toward zero.
3242 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3243 | the conversion overflows, the largest integer with the same sign as `a' is
3245 *----------------------------------------------------------------------------*/
3247 int64 float64_to_int64_round_to_zero(float64 a, float_status *status)
3250 int_fast16_t aExp, shiftCount;
3253 a = float64_squash_input_denormal(a, status);
3255 aSig = extractFloat64Frac( a );
3256 aExp = extractFloat64Exp( a );
3257 aSign = extractFloat64Sign( a );
3258 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3259 shiftCount = aExp - 0x433;
3260 if ( 0 <= shiftCount ) {
3261 if ( 0x43E <= aExp ) {
3262 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
3263 float_raise(float_flag_invalid, status);
3265 || ( ( aExp == 0x7FF )
3266 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3268 return LIT64( 0x7FFFFFFFFFFFFFFF );
3271 return (int64_t) LIT64( 0x8000000000000000 );
3273 z = aSig<<shiftCount;
3276 if ( aExp < 0x3FE ) {
3278 status->float_exception_flags |= float_flag_inexact;
3282 z = aSig>>( - shiftCount );
3283 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
3284 status->float_exception_flags |= float_flag_inexact;
3287 if ( aSign ) z = - z;
3292 /*----------------------------------------------------------------------------
3293 | Returns the result of converting the double-precision floating-point value
3294 | `a' to the single-precision floating-point format. The conversion is
3295 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3297 *----------------------------------------------------------------------------*/
3299 float32 float64_to_float32(float64 a, float_status *status)
3305 a = float64_squash_input_denormal(a, status);
3307 aSig = extractFloat64Frac( a );
3308 aExp = extractFloat64Exp( a );
3309 aSign = extractFloat64Sign( a );
3310 if ( aExp == 0x7FF ) {
3312 return commonNaNToFloat32(float64ToCommonNaN(a, status), status);
3314 return packFloat32( aSign, 0xFF, 0 );
3316 shift64RightJamming( aSig, 22, &aSig );
3318 if ( aExp || zSig ) {
3322 return roundAndPackFloat32(aSign, aExp, zSig, status);
3327 /*----------------------------------------------------------------------------
3328 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3329 | half-precision floating-point value, returning the result. After being
3330 | shifted into the proper positions, the three fields are simply added
3331 | together to form the result. This means that any integer portion of `zSig'
3332 | will be added into the exponent. Since a properly normalized significand
3333 | will have an integer portion equal to 1, the `zExp' input should be 1 less
3334 | than the desired result exponent whenever `zSig' is a complete, normalized
3336 *----------------------------------------------------------------------------*/
3337 static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
3339 return make_float16(
3340 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
3343 /*----------------------------------------------------------------------------
3344 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3345 | and significand `zSig', and returns the proper half-precision floating-
3346 | point value corresponding to the abstract input. Ordinarily, the abstract
3347 | value is simply rounded and packed into the half-precision format, with
3348 | the inexact exception raised if the abstract input cannot be represented
3349 | exactly. However, if the abstract value is too large, the overflow and
3350 | inexact exceptions are raised and an infinity or maximal finite value is
3351 | returned. If the abstract value is too small, the input value is rounded to
3352 | a subnormal number, and the underflow and inexact exceptions are raised if
3353 | the abstract input cannot be represented exactly as a subnormal half-
3354 | precision floating-point number.
3355 | The `ieee' flag indicates whether to use IEEE standard half precision, or
3356 | ARM-style "alternative representation", which omits the NaN and Inf
3357 | encodings in order to raise the maximum representable exponent by one.
3358 | The input significand `zSig' has its binary point between bits 22
3359 | and 23, which is 13 bits to the left of the usual location. This shifted
3360 | significand must be normalized or smaller. If `zSig' is not normalized,
3361 | `zExp' must be 0; in that case, the result returned is a subnormal number,
3362 | and it must not require rounding. In the usual case that `zSig' is
3363 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3364 | Note the slightly odd position of the binary point in zSig compared with the
3365 | other roundAndPackFloat functions. This should probably be fixed if we
3366 | need to implement more float16 routines than just conversion.
3367 | The handling of underflow and overflow follows the IEC/IEEE Standard for
3368 | Binary Floating-Point Arithmetic.
3369 *----------------------------------------------------------------------------*/
3371 static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3372 uint32_t zSig, flag ieee,
3373 float_status *status)
3375 int maxexp = ieee ? 29 : 30;
3378 bool rounding_bumps_exp;
3379 bool is_tiny = false;
3381 /* Calculate the mask of bits of the mantissa which are not
3382 * representable in half-precision and will be lost.
3385 /* Will be denormal in halfprec */
3391 /* Normal number in halfprec */
3395 switch (status->float_rounding_mode) {
3396 case float_round_nearest_even:
3397 increment = (mask + 1) >> 1;
3398 if ((zSig & mask) == increment) {
3399 increment = zSig & (increment << 1);
3402 case float_round_ties_away:
3403 increment = (mask + 1) >> 1;
3405 case float_round_up:
3406 increment = zSign ? 0 : mask;
3408 case float_round_down:
3409 increment = zSign ? mask : 0;
3411 default: /* round_to_zero */
3416 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3418 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3420 float_raise(float_flag_overflow | float_flag_inexact, status);
3421 return packFloat16(zSign, 0x1f, 0);
3423 float_raise(float_flag_invalid, status);
3424 return packFloat16(zSign, 0x1f, 0x3ff);
3429 /* Note that flush-to-zero does not affect half-precision results */
3431 (status->float_detect_tininess == float_tininess_before_rounding)
3433 || (!rounding_bumps_exp);
3436 float_raise(float_flag_inexact, status);
3438 float_raise(float_flag_underflow, status);
3443 if (rounding_bumps_exp) {
3449 return packFloat16(zSign, 0, 0);
3455 return packFloat16(zSign, zExp, zSig >> 13);
3458 static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
3461 int8_t shiftCount = countLeadingZeros32(aSig) - 21;
3462 *zSigPtr = aSig << shiftCount;
3463 *zExpPtr = 1 - shiftCount;
3466 /* Half precision floats come in two formats: standard IEEE and "ARM" format.
3467 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
3469 float32 float16_to_float32(float16 a, flag ieee, float_status *status)
3475 aSign = extractFloat16Sign(a);
3476 aExp = extractFloat16Exp(a);
3477 aSig = extractFloat16Frac(a);
3479 if (aExp == 0x1f && ieee) {
3481 return commonNaNToFloat32(float16ToCommonNaN(a, status), status);
3483 return packFloat32(aSign, 0xff, 0);
3487 return packFloat32(aSign, 0, 0);
3490 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3493 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3496 float16 float32_to_float16(float32 a, flag ieee, float_status *status)
3502 a = float32_squash_input_denormal(a, status);
3504 aSig = extractFloat32Frac( a );
3505 aExp = extractFloat32Exp( a );
3506 aSign = extractFloat32Sign( a );
3507 if ( aExp == 0xFF ) {
3509 /* Input is a NaN */
3511 float_raise(float_flag_invalid, status);
3512 return packFloat16(aSign, 0, 0);
3514 return commonNaNToFloat16(
3515 float32ToCommonNaN(a, status), status);
3519 float_raise(float_flag_invalid, status);
3520 return packFloat16(aSign, 0x1f, 0x3ff);
3522 return packFloat16(aSign, 0x1f, 0);
3524 if (aExp == 0 && aSig == 0) {
3525 return packFloat16(aSign, 0, 0);
3527 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3528 * even if the input is denormal; however this is harmless because
3529 * the largest possible single-precision denormal is still smaller
3530 * than the smallest representable half-precision denormal, and so we
3531 * will end up ignoring aSig and returning via the "always return zero"
3537 return roundAndPackFloat16(aSign, aExp, aSig, ieee, status);
3540 float64 float16_to_float64(float16 a, flag ieee, float_status *status)
3546 aSign = extractFloat16Sign(a);
3547 aExp = extractFloat16Exp(a);
3548 aSig = extractFloat16Frac(a);
3550 if (aExp == 0x1f && ieee) {
3552 return commonNaNToFloat64(
3553 float16ToCommonNaN(a, status), status);
3555 return packFloat64(aSign, 0x7ff, 0);
3559 return packFloat64(aSign, 0, 0);
3562 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3565 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3568 float16 float64_to_float16(float64 a, flag ieee, float_status *status)
3575 a = float64_squash_input_denormal(a, status);
3577 aSig = extractFloat64Frac(a);
3578 aExp = extractFloat64Exp(a);
3579 aSign = extractFloat64Sign(a);
3580 if (aExp == 0x7FF) {
3582 /* Input is a NaN */
3584 float_raise(float_flag_invalid, status);
3585 return packFloat16(aSign, 0, 0);
3587 return commonNaNToFloat16(
3588 float64ToCommonNaN(a, status), status);
3592 float_raise(float_flag_invalid, status);
3593 return packFloat16(aSign, 0x1f, 0x3ff);
3595 return packFloat16(aSign, 0x1f, 0);
3597 shift64RightJamming(aSig, 29, &aSig);
3599 if (aExp == 0 && zSig == 0) {
3600 return packFloat16(aSign, 0, 0);
3602 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3603 * even if the input is denormal; however this is harmless because
3604 * the largest possible single-precision denormal is still smaller
3605 * than the smallest representable half-precision denormal, and so we
3606 * will end up ignoring aSig and returning via the "always return zero"
3612 return roundAndPackFloat16(aSign, aExp, zSig, ieee, status);
3615 /*----------------------------------------------------------------------------
3616 | Returns the result of converting the double-precision floating-point value
3617 | `a' to the extended double-precision floating-point format. The conversion
3618 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
3620 *----------------------------------------------------------------------------*/
3622 floatx80 float64_to_floatx80(float64 a, float_status *status)
3628 a = float64_squash_input_denormal(a, status);
3629 aSig = extractFloat64Frac( a );
3630 aExp = extractFloat64Exp( a );
3631 aSign = extractFloat64Sign( a );
3632 if ( aExp == 0x7FF ) {
3634 return commonNaNToFloatx80(float64ToCommonNaN(a, status), status);
3636 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3639 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3640 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3644 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3648 /*----------------------------------------------------------------------------
3649 | Returns the result of converting the double-precision floating-point value
3650 | `a' to the quadruple-precision floating-point format. The conversion is
3651 | performed according to the IEC/IEEE Standard for Binary Floating-Point
3653 *----------------------------------------------------------------------------*/
3655 float128 float64_to_float128(float64 a, float_status *status)
3659 uint64_t aSig, zSig0, zSig1;
3661 a = float64_squash_input_denormal(a, status);
3662 aSig = extractFloat64Frac( a );
3663 aExp = extractFloat64Exp( a );
3664 aSign = extractFloat64Sign( a );
3665 if ( aExp == 0x7FF ) {
3667 return commonNaNToFloat128(float64ToCommonNaN(a, status), status);
3669 return packFloat128( aSign, 0x7FFF, 0, 0 );
3672 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3673 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3676 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3677 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3681 /*----------------------------------------------------------------------------
3682 | Rounds the double-precision floating-point value `a' to an integer, and
3683 | returns the result as a double-precision floating-point value. The
3684 | operation is performed according to the IEC/IEEE Standard for Binary
3685 | Floating-Point Arithmetic.
3686 *----------------------------------------------------------------------------*/
3688 float64 float64_round_to_int(float64 a, float_status *status)
3692 uint64_t lastBitMask, roundBitsMask;
3694 a = float64_squash_input_denormal(a, status);
3696 aExp = extractFloat64Exp( a );
3697 if ( 0x433 <= aExp ) {
3698 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3699 return propagateFloat64NaN(a, a, status);
3703 if ( aExp < 0x3FF ) {
3704 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
3705 status->float_exception_flags |= float_flag_inexact;
3706 aSign = extractFloat64Sign( a );
3707 switch (status->float_rounding_mode) {
3708 case float_round_nearest_even:
3709 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3710 return packFloat64( aSign, 0x3FF, 0 );
3713 case float_round_ties_away:
3714 if (aExp == 0x3FE) {
3715 return packFloat64(aSign, 0x3ff, 0);
3718 case float_round_down:
3719 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
3720 case float_round_up:
3721 return make_float64(
3722 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
3724 return packFloat64( aSign, 0, 0 );
3727 lastBitMask <<= 0x433 - aExp;
3728 roundBitsMask = lastBitMask - 1;
3730 switch (status->float_rounding_mode) {
3731 case float_round_nearest_even:
3732 z += lastBitMask >> 1;
3733 if ((z & roundBitsMask) == 0) {
3737 case float_round_ties_away:
3738 z += lastBitMask >> 1;
3740 case float_round_to_zero:
3742 case float_round_up:
3743 if (!extractFloat64Sign(make_float64(z))) {
3747 case float_round_down:
3748 if (extractFloat64Sign(make_float64(z))) {
3755 z &= ~ roundBitsMask;
3756 if (z != float64_val(a)) {
3757 status->float_exception_flags |= float_flag_inexact;
3759 return make_float64(z);
3763 float64 float64_trunc_to_int(float64 a, float_status *status)
3767 oldmode = status->float_rounding_mode;
3768 status->float_rounding_mode = float_round_to_zero;
3769 res = float64_round_to_int(a, status);
3770 status->float_rounding_mode = oldmode;
3774 /*----------------------------------------------------------------------------
3775 | Returns the result of adding the absolute values of the double-precision
3776 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3777 | before being returned. `zSign' is ignored if the result is a NaN.
3778 | The addition is performed according to the IEC/IEEE Standard for Binary
3779 | Floating-Point Arithmetic.
3780 *----------------------------------------------------------------------------*/
3782 static float64 addFloat64Sigs(float64 a, float64 b, flag zSign,
3783 float_status *status)
3785 int_fast16_t aExp, bExp, zExp;
3786 uint64_t aSig, bSig, zSig;
3787 int_fast16_t expDiff;
3789 aSig = extractFloat64Frac( a );
3790 aExp = extractFloat64Exp( a );
3791 bSig = extractFloat64Frac( b );
3792 bExp = extractFloat64Exp( b );
3793 expDiff = aExp - bExp;
3796 if ( 0 < expDiff ) {
3797 if ( aExp == 0x7FF ) {
3799 return propagateFloat64NaN(a, b, status);
3807 bSig |= LIT64( 0x2000000000000000 );
3809 shift64RightJamming( bSig, expDiff, &bSig );
3812 else if ( expDiff < 0 ) {
3813 if ( bExp == 0x7FF ) {
3815 return propagateFloat64NaN(a, b, status);
3817 return packFloat64( zSign, 0x7FF, 0 );
3823 aSig |= LIT64( 0x2000000000000000 );
3825 shift64RightJamming( aSig, - expDiff, &aSig );
3829 if ( aExp == 0x7FF ) {
3831 return propagateFloat64NaN(a, b, status);
3836 if (status->flush_to_zero) {
3838 float_raise(float_flag_output_denormal, status);
3840 return packFloat64(zSign, 0, 0);
3842 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3844 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3848 aSig |= LIT64( 0x2000000000000000 );
3849 zSig = ( aSig + bSig )<<1;
3851 if ( (int64_t) zSig < 0 ) {
3856 return roundAndPackFloat64(zSign, zExp, zSig, status);
3860 /*----------------------------------------------------------------------------
3861 | Returns the result of subtracting the absolute values of the double-
3862 | precision floating-point values `a' and `b'. If `zSign' is 1, the
3863 | difference is negated before being returned. `zSign' is ignored if the
3864 | result is a NaN. The subtraction is performed according to the IEC/IEEE
3865 | Standard for Binary Floating-Point Arithmetic.
3866 *----------------------------------------------------------------------------*/
3868 static float64 subFloat64Sigs(float64 a, float64 b, flag zSign,
3869 float_status *status)
3871 int_fast16_t aExp, bExp, zExp;
3872 uint64_t aSig, bSig, zSig;
3873 int_fast16_t expDiff;
3875 aSig = extractFloat64Frac( a );
3876 aExp = extractFloat64Exp( a );
3877 bSig = extractFloat64Frac( b );
3878 bExp = extractFloat64Exp( b );
3879 expDiff = aExp - bExp;
3882 if ( 0 < expDiff ) goto aExpBigger;
3883 if ( expDiff < 0 ) goto bExpBigger;
3884 if ( aExp == 0x7FF ) {
3886 return propagateFloat64NaN(a, b, status);
3888 float_raise(float_flag_invalid, status);
3889 return float64_default_nan;
3895 if ( bSig < aSig ) goto aBigger;
3896 if ( aSig < bSig ) goto bBigger;
3897 return packFloat64(status->float_rounding_mode == float_round_down, 0, 0);
3899 if ( bExp == 0x7FF ) {
3901 return propagateFloat64NaN(a, b, status);
3903 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3909 aSig |= LIT64( 0x4000000000000000 );
3911 shift64RightJamming( aSig, - expDiff, &aSig );
3912 bSig |= LIT64( 0x4000000000000000 );
3917 goto normalizeRoundAndPack;
3919 if ( aExp == 0x7FF ) {
3921 return propagateFloat64NaN(a, b, status);
3929 bSig |= LIT64( 0x4000000000000000 );
3931 shift64RightJamming( bSig, expDiff, &bSig );
3932 aSig |= LIT64( 0x4000000000000000 );
3936 normalizeRoundAndPack:
3938 return normalizeRoundAndPackFloat64(zSign, zExp, zSig, status);
3942 /*----------------------------------------------------------------------------
3943 | Returns the result of adding the double-precision floating-point values `a'
3944 | and `b'. The operation is performed according to the IEC/IEEE Standard for
3945 | Binary Floating-Point Arithmetic.
3946 *----------------------------------------------------------------------------*/
3948 float64 float64_add(float64 a, float64 b, float_status *status)
3951 a = float64_squash_input_denormal(a, status);
3952 b = float64_squash_input_denormal(b, status);
3954 aSign = extractFloat64Sign( a );
3955 bSign = extractFloat64Sign( b );
3956 if ( aSign == bSign ) {
3957 return addFloat64Sigs(a, b, aSign, status);
3960 return subFloat64Sigs(a, b, aSign, status);
3965 /*----------------------------------------------------------------------------
3966 | Returns the result of subtracting the double-precision floating-point values
3967 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3968 | for Binary Floating-Point Arithmetic.
3969 *----------------------------------------------------------------------------*/
3971 float64 float64_sub(float64 a, float64 b, float_status *status)
3974 a = float64_squash_input_denormal(a, status);
3975 b = float64_squash_input_denormal(b, status);
3977 aSign = extractFloat64Sign( a );
3978 bSign = extractFloat64Sign( b );
3979 if ( aSign == bSign ) {
3980 return subFloat64Sigs(a, b, aSign, status);
3983 return addFloat64Sigs(a, b, aSign, status);
3988 /*----------------------------------------------------------------------------
3989 | Returns the result of multiplying the double-precision floating-point values
3990 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3991 | for Binary Floating-Point Arithmetic.
3992 *----------------------------------------------------------------------------*/
3994 float64 float64_mul(float64 a, float64 b, float_status *status)
3996 flag aSign, bSign, zSign;
3997 int_fast16_t aExp, bExp, zExp;
3998 uint64_t aSig, bSig, zSig0, zSig1;
4000 a = float64_squash_input_denormal(a, status);
4001 b = float64_squash_input_denormal(b, status);
4003 aSig = extractFloat64Frac( a );
4004 aExp = extractFloat64Exp( a );
4005 aSign = extractFloat64Sign( a );
4006 bSig = extractFloat64Frac( b );
4007 bExp = extractFloat64Exp( b );
4008 bSign = extractFloat64Sign( b );
4009 zSign = aSign ^ bSign;
4010 if ( aExp == 0x7FF ) {
4011 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4012 return propagateFloat64NaN(a, b, status);
4014 if ( ( bExp | bSig ) == 0 ) {
4015 float_raise(float_flag_invalid, status);
4016 return float64_default_nan;
4018 return packFloat64( zSign, 0x7FF, 0 );
4020 if ( bExp == 0x7FF ) {
4022 return propagateFloat64NaN(a, b, status);
4024 if ( ( aExp | aSig ) == 0 ) {
4025 float_raise(float_flag_invalid, status);
4026 return float64_default_nan;
4028 return packFloat64( zSign, 0x7FF, 0 );
4031 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4032 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4035 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
4036 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4038 zExp = aExp + bExp - 0x3FF;
4039 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4040 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4041 mul64To128( aSig, bSig, &zSig0, &zSig1 );
4042 zSig0 |= ( zSig1 != 0 );
4043 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
4047 return roundAndPackFloat64(zSign, zExp, zSig0, status);
4051 /*----------------------------------------------------------------------------
4052 | Returns the result of dividing the double-precision floating-point value `a'
4053 | by the corresponding value `b'. The operation is performed according to
4054 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4055 *----------------------------------------------------------------------------*/
4057 float64 float64_div(float64 a, float64 b, float_status *status)
4059 flag aSign, bSign, zSign;
4060 int_fast16_t aExp, bExp, zExp;
4061 uint64_t aSig, bSig, zSig;
4062 uint64_t rem0, rem1;
4063 uint64_t term0, term1;
4064 a = float64_squash_input_denormal(a, status);
4065 b = float64_squash_input_denormal(b, status);
4067 aSig = extractFloat64Frac( a );
4068 aExp = extractFloat64Exp( a );
4069 aSign = extractFloat64Sign( a );
4070 bSig = extractFloat64Frac( b );
4071 bExp = extractFloat64Exp( b );
4072 bSign = extractFloat64Sign( b );
4073 zSign = aSign ^ bSign;
4074 if ( aExp == 0x7FF ) {
4076 return propagateFloat64NaN(a, b, status);
4078 if ( bExp == 0x7FF ) {
4080 return propagateFloat64NaN(a, b, status);
4082 float_raise(float_flag_invalid, status);
4083 return float64_default_nan;
4085 return packFloat64( zSign, 0x7FF, 0 );
4087 if ( bExp == 0x7FF ) {
4089 return propagateFloat64NaN(a, b, status);
4091 return packFloat64( zSign, 0, 0 );
4095 if ( ( aExp | aSig ) == 0 ) {
4096 float_raise(float_flag_invalid, status);
4097 return float64_default_nan;
4099 float_raise(float_flag_divbyzero, status);
4100 return packFloat64( zSign, 0x7FF, 0 );
4102 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4105 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
4106 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4108 zExp = aExp - bExp + 0x3FD;
4109 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
4110 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4111 if ( bSig <= ( aSig + aSig ) ) {
4115 zSig = estimateDiv128To64( aSig, 0, bSig );
4116 if ( ( zSig & 0x1FF ) <= 2 ) {
4117 mul64To128( bSig, zSig, &term0, &term1 );
4118 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4119 while ( (int64_t) rem0 < 0 ) {
4121 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4123 zSig |= ( rem1 != 0 );
4125 return roundAndPackFloat64(zSign, zExp, zSig, status);
4129 /*----------------------------------------------------------------------------
4130 | Returns the remainder of the double-precision floating-point value `a'
4131 | with respect to the corresponding value `b'. The operation is performed
4132 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4133 *----------------------------------------------------------------------------*/
4135 float64 float64_rem(float64 a, float64 b, float_status *status)
4138 int_fast16_t aExp, bExp, expDiff;
4139 uint64_t aSig, bSig;
4140 uint64_t q, alternateASig;
4143 a = float64_squash_input_denormal(a, status);
4144 b = float64_squash_input_denormal(b, status);
4145 aSig = extractFloat64Frac( a );
4146 aExp = extractFloat64Exp( a );
4147 aSign = extractFloat64Sign( a );
4148 bSig = extractFloat64Frac( b );
4149 bExp = extractFloat64Exp( b );
4150 if ( aExp == 0x7FF ) {
4151 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4152 return propagateFloat64NaN(a, b, status);
4154 float_raise(float_flag_invalid, status);
4155 return float64_default_nan;
4157 if ( bExp == 0x7FF ) {
4159 return propagateFloat64NaN(a, b, status);
4165 float_raise(float_flag_invalid, status);
4166 return float64_default_nan;
4168 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4171 if ( aSig == 0 ) return a;
4172 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4174 expDiff = aExp - bExp;
4175 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4176 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4177 if ( expDiff < 0 ) {
4178 if ( expDiff < -1 ) return a;
4181 q = ( bSig <= aSig );
4182 if ( q ) aSig -= bSig;
4184 while ( 0 < expDiff ) {
4185 q = estimateDiv128To64( aSig, 0, bSig );
4186 q = ( 2 < q ) ? q - 2 : 0;
4187 aSig = - ( ( bSig>>2 ) * q );
4191 if ( 0 < expDiff ) {
4192 q = estimateDiv128To64( aSig, 0, bSig );
4193 q = ( 2 < q ) ? q - 2 : 0;
4196 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4203 alternateASig = aSig;
4206 } while ( 0 <= (int64_t) aSig );
4207 sigMean = aSig + alternateASig;
4208 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4209 aSig = alternateASig;
4211 zSign = ( (int64_t) aSig < 0 );
4212 if ( zSign ) aSig = - aSig;
4213 return normalizeRoundAndPackFloat64(aSign ^ zSign, bExp, aSig, status);
4217 /*----------------------------------------------------------------------------
4218 | Returns the result of multiplying the double-precision floating-point values
4219 | `a' and `b' then adding 'c', with no intermediate rounding step after the
4220 | multiplication. The operation is performed according to the IEC/IEEE
4221 | Standard for Binary Floating-Point Arithmetic 754-2008.
4222 | The flags argument allows the caller to select negation of the
4223 | addend, the intermediate product, or the final result. (The difference
4224 | between this and having the caller do a separate negation is that negating
4225 | externally will flip the sign bit on NaNs.)
4226 *----------------------------------------------------------------------------*/
4228 float64 float64_muladd(float64 a, float64 b, float64 c, int flags,
4229 float_status *status)
4231 flag aSign, bSign, cSign, zSign;
4232 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
4233 uint64_t aSig, bSig, cSig;
4234 flag pInf, pZero, pSign;
4235 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4237 flag signflip, infzero;
4239 a = float64_squash_input_denormal(a, status);
4240 b = float64_squash_input_denormal(b, status);
4241 c = float64_squash_input_denormal(c, status);
4242 aSig = extractFloat64Frac(a);
4243 aExp = extractFloat64Exp(a);
4244 aSign = extractFloat64Sign(a);
4245 bSig = extractFloat64Frac(b);
4246 bExp = extractFloat64Exp(b);
4247 bSign = extractFloat64Sign(b);
4248 cSig = extractFloat64Frac(c);
4249 cExp = extractFloat64Exp(c);
4250 cSign = extractFloat64Sign(c);
4252 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4253 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4255 /* It is implementation-defined whether the cases of (0,inf,qnan)
4256 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4257 * they return if they do), so we have to hand this information
4258 * off to the target-specific pick-a-NaN routine.
4260 if (((aExp == 0x7ff) && aSig) ||
4261 ((bExp == 0x7ff) && bSig) ||
4262 ((cExp == 0x7ff) && cSig)) {
4263 return propagateFloat64MulAddNaN(a, b, c, infzero, status);
4267 float_raise(float_flag_invalid, status);
4268 return float64_default_nan;
4271 if (flags & float_muladd_negate_c) {
4275 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4277 /* Work out the sign and type of the product */
4278 pSign = aSign ^ bSign;
4279 if (flags & float_muladd_negate_product) {
4282 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4283 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4285 if (cExp == 0x7ff) {
4286 if (pInf && (pSign ^ cSign)) {
4287 /* addition of opposite-signed infinities => InvalidOperation */
4288 float_raise(float_flag_invalid, status);
4289 return float64_default_nan;
4291 /* Otherwise generate an infinity of the same sign */
4292 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4296 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4302 /* Adding two exact zeroes */
4303 if (pSign == cSign) {
4305 } else if (status->float_rounding_mode == float_round_down) {
4310 return packFloat64(zSign ^ signflip, 0, 0);
4312 /* Exact zero plus a denorm */
4313 if (status->flush_to_zero) {
4314 float_raise(float_flag_output_denormal, status);
4315 return packFloat64(cSign ^ signflip, 0, 0);
4318 /* Zero plus something non-zero : just return the something */
4319 if (flags & float_muladd_halve_result) {
4321 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4323 /* Subtract one to halve, and one again because roundAndPackFloat64
4324 * wants one less than the true exponent.
4327 cSig = (cSig | 0x0010000000000000ULL) << 10;
4328 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig, status);
4330 return packFloat64(cSign ^ signflip, cExp, cSig);
4334 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4337 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4340 /* Calculate the actual result a * b + c */
4342 /* Multiply first; this is easy. */
4343 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4344 * because we want the true exponent, not the "one-less-than"
4345 * flavour that roundAndPackFloat64() takes.
4347 pExp = aExp + bExp - 0x3fe;
4348 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4349 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4350 mul64To128(aSig, bSig, &pSig0, &pSig1);
4351 if ((int64_t)(pSig0 << 1) >= 0) {
4352 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4356 zSign = pSign ^ signflip;
4358 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4359 * bit in position 126.
4363 /* Throw out the special case of c being an exact zero now */
4364 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
4365 if (flags & float_muladd_halve_result) {
4368 return roundAndPackFloat64(zSign, pExp - 1,
4371 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4374 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4375 * significand of the addend, with the explicit bit in position 126.
4377 cSig0 = cSig << (126 - 64 - 52);
4379 cSig0 |= LIT64(0x4000000000000000);
4380 expDiff = pExp - cExp;
4382 if (pSign == cSign) {
4385 /* scale c to match p */
4386 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4388 } else if (expDiff < 0) {
4389 /* scale p to match c */
4390 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4393 /* no scaling needed */
4396 /* Add significands and make sure explicit bit ends up in posn 126 */
4397 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4398 if ((int64_t)zSig0 < 0) {
4399 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4403 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
4404 if (flags & float_muladd_halve_result) {
4407 return roundAndPackFloat64(zSign, zExp, zSig1, status);
4411 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4412 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4414 } else if (expDiff < 0) {
4415 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4416 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4421 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4422 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4423 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4424 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4429 if (status->float_rounding_mode == float_round_down) {
4432 return packFloat64(zSign, 0, 0);
4436 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4437 * starting with the significand in a pair of uint64_t.
4440 shiftcount = countLeadingZeros64(zSig0) - 1;
4441 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4447 shiftcount = countLeadingZeros64(zSig1);
4448 if (shiftcount == 0) {
4449 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4453 zSig0 = zSig1 << shiftcount;
4454 zExp -= (shiftcount + 64);
4457 if (flags & float_muladd_halve_result) {
4460 return roundAndPackFloat64(zSign, zExp, zSig0, status);
4464 /*----------------------------------------------------------------------------
4465 | Returns the square root of the double-precision floating-point value `a'.
4466 | The operation is performed according to the IEC/IEEE Standard for Binary
4467 | Floating-Point Arithmetic.
4468 *----------------------------------------------------------------------------*/
4470 float64 float64_sqrt(float64 a, float_status *status)
4473 int_fast16_t aExp, zExp;
4474 uint64_t aSig, zSig, doubleZSig;
4475 uint64_t rem0, rem1, term0, term1;
4476 a = float64_squash_input_denormal(a, status);
4478 aSig = extractFloat64Frac( a );
4479 aExp = extractFloat64Exp( a );
4480 aSign = extractFloat64Sign( a );
4481 if ( aExp == 0x7FF ) {
4483 return propagateFloat64NaN(a, a, status);
4485 if ( ! aSign ) return a;
4486 float_raise(float_flag_invalid, status);
4487 return float64_default_nan;
4490 if ( ( aExp | aSig ) == 0 ) return a;
4491 float_raise(float_flag_invalid, status);
4492 return float64_default_nan;
4495 if ( aSig == 0 ) return float64_zero;
4496 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4498 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4499 aSig |= LIT64( 0x0010000000000000 );
4500 zSig = estimateSqrt32( aExp, aSig>>21 );
4501 aSig <<= 9 - ( aExp & 1 );
4502 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4503 if ( ( zSig & 0x1FF ) <= 5 ) {
4504 doubleZSig = zSig<<1;
4505 mul64To128( zSig, zSig, &term0, &term1 );
4506 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
4507 while ( (int64_t) rem0 < 0 ) {
4510 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4512 zSig |= ( ( rem0 | rem1 ) != 0 );
4514 return roundAndPackFloat64(0, zExp, zSig, status);
4518 /*----------------------------------------------------------------------------
4519 | Returns the binary log of the double-precision floating-point value `a'.
4520 | The operation is performed according to the IEC/IEEE Standard for Binary
4521 | Floating-Point Arithmetic.
4522 *----------------------------------------------------------------------------*/
4523 float64 float64_log2(float64 a, float_status *status)
4527 uint64_t aSig, aSig0, aSig1, zSig, i;
4528 a = float64_squash_input_denormal(a, status);
4530 aSig = extractFloat64Frac( a );
4531 aExp = extractFloat64Exp( a );
4532 aSign = extractFloat64Sign( a );
4535 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4536 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4539 float_raise(float_flag_invalid, status);
4540 return float64_default_nan;
4542 if ( aExp == 0x7FF ) {
4544 return propagateFloat64NaN(a, float64_zero, status);
4550 aSig |= LIT64( 0x0010000000000000 );
4552 zSig = (uint64_t)aExp << 52;
4553 for (i = 1LL << 51; i > 0; i >>= 1) {
4554 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4555 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4556 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4564 return normalizeRoundAndPackFloat64(zSign, 0x408, zSig, status);
4567 /*----------------------------------------------------------------------------
4568 | Returns 1 if the double-precision floating-point value `a' is equal to the
4569 | corresponding value `b', and 0 otherwise. The invalid exception is raised
4570 | if either operand is a NaN. Otherwise, the comparison is performed
4571 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4572 *----------------------------------------------------------------------------*/
4574 int float64_eq(float64 a, float64 b, float_status *status)
4577 a = float64_squash_input_denormal(a, status);
4578 b = float64_squash_input_denormal(b, status);
4580 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4581 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4583 float_raise(float_flag_invalid, status);
4586 av = float64_val(a);
4587 bv = float64_val(b);
4588 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4592 /*----------------------------------------------------------------------------
4593 | Returns 1 if the double-precision floating-point value `a' is less than or
4594 | equal to the corresponding value `b', and 0 otherwise. The invalid
4595 | exception is raised if either operand is a NaN. The comparison is performed
4596 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4597 *----------------------------------------------------------------------------*/
4599 int float64_le(float64 a, float64 b, float_status *status)
4603 a = float64_squash_input_denormal(a, status);
4604 b = float64_squash_input_denormal(b, status);
4606 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4607 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4609 float_raise(float_flag_invalid, status);
4612 aSign = extractFloat64Sign( a );
4613 bSign = extractFloat64Sign( b );
4614 av = float64_val(a);
4615 bv = float64_val(b);
4616 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4617 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4621 /*----------------------------------------------------------------------------
4622 | Returns 1 if the double-precision floating-point value `a' is less than
4623 | the corresponding value `b', and 0 otherwise. The invalid exception is
4624 | raised if either operand is a NaN. The comparison is performed according
4625 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4626 *----------------------------------------------------------------------------*/
4628 int float64_lt(float64 a, float64 b, float_status *status)
4633 a = float64_squash_input_denormal(a, status);
4634 b = float64_squash_input_denormal(b, status);
4635 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4636 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4638 float_raise(float_flag_invalid, status);
4641 aSign = extractFloat64Sign( a );
4642 bSign = extractFloat64Sign( b );
4643 av = float64_val(a);
4644 bv = float64_val(b);
4645 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4646 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4650 /*----------------------------------------------------------------------------
4651 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4652 | be compared, and 0 otherwise. The invalid exception is raised if either
4653 | operand is a NaN. The comparison is performed according to the IEC/IEEE
4654 | Standard for Binary Floating-Point Arithmetic.
4655 *----------------------------------------------------------------------------*/
4657 int float64_unordered(float64 a, float64 b, float_status *status)
4659 a = float64_squash_input_denormal(a, status);
4660 b = float64_squash_input_denormal(b, status);
4662 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4663 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4665 float_raise(float_flag_invalid, status);
4671 /*----------------------------------------------------------------------------
4672 | Returns 1 if the double-precision floating-point value `a' is equal to the
4673 | corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4674 | exception.The comparison is performed according to the IEC/IEEE Standard
4675 | for Binary Floating-Point Arithmetic.
4676 *----------------------------------------------------------------------------*/
4678 int float64_eq_quiet(float64 a, float64 b, float_status *status)
4681 a = float64_squash_input_denormal(a, status);
4682 b = float64_squash_input_denormal(b, status);
4684 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4685 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4687 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4688 float_raise(float_flag_invalid, status);
4692 av = float64_val(a);
4693 bv = float64_val(b);
4694 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4698 /*----------------------------------------------------------------------------
4699 | Returns 1 if the double-precision floating-point value `a' is less than or
4700 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4701 | cause an exception. Otherwise, the comparison is performed according to the
4702 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4703 *----------------------------------------------------------------------------*/
4705 int float64_le_quiet(float64 a, float64 b, float_status *status)
4709 a = float64_squash_input_denormal(a, status);
4710 b = float64_squash_input_denormal(b, status);
4712 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4713 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4715 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4716 float_raise(float_flag_invalid, status);
4720 aSign = extractFloat64Sign( a );
4721 bSign = extractFloat64Sign( b );
4722 av = float64_val(a);
4723 bv = float64_val(b);
4724 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
4725 return ( av == bv ) || ( aSign ^ ( av < bv ) );
4729 /*----------------------------------------------------------------------------
4730 | Returns 1 if the double-precision floating-point value `a' is less than
4731 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4732 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
4733 | Standard for Binary Floating-Point Arithmetic.
4734 *----------------------------------------------------------------------------*/
4736 int float64_lt_quiet(float64 a, float64 b, float_status *status)
4740 a = float64_squash_input_denormal(a, status);
4741 b = float64_squash_input_denormal(b, status);
4743 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4744 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4746 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4747 float_raise(float_flag_invalid, status);
4751 aSign = extractFloat64Sign( a );
4752 bSign = extractFloat64Sign( b );
4753 av = float64_val(a);
4754 bv = float64_val(b);
4755 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
4756 return ( av != bv ) && ( aSign ^ ( av < bv ) );
4760 /*----------------------------------------------------------------------------
4761 | Returns 1 if the double-precision floating-point values `a' and `b' cannot
4762 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4763 | comparison is performed according to the IEC/IEEE Standard for Binary
4764 | Floating-Point Arithmetic.
4765 *----------------------------------------------------------------------------*/
4767 int float64_unordered_quiet(float64 a, float64 b, float_status *status)
4769 a = float64_squash_input_denormal(a, status);
4770 b = float64_squash_input_denormal(b, status);
4772 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4773 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4775 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4776 float_raise(float_flag_invalid, status);
4783 /*----------------------------------------------------------------------------
4784 | Returns the result of converting the extended double-precision floating-
4785 | point value `a' to the 32-bit two's complement integer format. The
4786 | conversion is performed according to the IEC/IEEE Standard for Binary
4787 | Floating-Point Arithmetic---which means in particular that the conversion
4788 | is rounded according to the current rounding mode. If `a' is a NaN, the
4789 | largest positive integer is returned. Otherwise, if the conversion
4790 | overflows, the largest integer with the same sign as `a' is returned.
4791 *----------------------------------------------------------------------------*/
4793 int32 floatx80_to_int32(floatx80 a, float_status *status)
4796 int32 aExp, shiftCount;
4799 aSig = extractFloatx80Frac( a );
4800 aExp = extractFloatx80Exp( a );
4801 aSign = extractFloatx80Sign( a );
4802 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4803 shiftCount = 0x4037 - aExp;
4804 if ( shiftCount <= 0 ) shiftCount = 1;
4805 shift64RightJamming( aSig, shiftCount, &aSig );
4806 return roundAndPackInt32(aSign, aSig, status);
4810 /*----------------------------------------------------------------------------
4811 | Returns the result of converting the extended double-precision floating-
4812 | point value `a' to the 32-bit two's complement integer format. The
4813 | conversion is performed according to the IEC/IEEE Standard for Binary
4814 | Floating-Point Arithmetic, except that the conversion is always rounded
4815 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4816 | Otherwise, if the conversion overflows, the largest integer with the same
4817 | sign as `a' is returned.
4818 *----------------------------------------------------------------------------*/
4820 int32 floatx80_to_int32_round_to_zero(floatx80 a, float_status *status)
4823 int32 aExp, shiftCount;
4824 uint64_t aSig, savedASig;
4827 aSig = extractFloatx80Frac( a );
4828 aExp = extractFloatx80Exp( a );
4829 aSign = extractFloatx80Sign( a );
4830 if ( 0x401E < aExp ) {
4831 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
4834 else if ( aExp < 0x3FFF ) {
4836 status->float_exception_flags |= float_flag_inexact;
4840 shiftCount = 0x403E - aExp;
4842 aSig >>= shiftCount;
4844 if ( aSign ) z = - z;
4845 if ( ( z < 0 ) ^ aSign ) {
4847 float_raise(float_flag_invalid, status);
4848 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
4850 if ( ( aSig<<shiftCount ) != savedASig ) {
4851 status->float_exception_flags |= float_flag_inexact;
4857 /*----------------------------------------------------------------------------
4858 | Returns the result of converting the extended double-precision floating-
4859 | point value `a' to the 64-bit two's complement integer format. The
4860 | conversion is performed according to the IEC/IEEE Standard for Binary
4861 | Floating-Point Arithmetic---which means in particular that the conversion
4862 | is rounded according to the current rounding mode. If `a' is a NaN,
4863 | the largest positive integer is returned. Otherwise, if the conversion
4864 | overflows, the largest integer with the same sign as `a' is returned.
4865 *----------------------------------------------------------------------------*/
4867 int64 floatx80_to_int64(floatx80 a, float_status *status)
4870 int32 aExp, shiftCount;
4871 uint64_t aSig, aSigExtra;
4873 aSig = extractFloatx80Frac( a );
4874 aExp = extractFloatx80Exp( a );
4875 aSign = extractFloatx80Sign( a );
4876 shiftCount = 0x403E - aExp;
4877 if ( shiftCount <= 0 ) {
4879 float_raise(float_flag_invalid, status);
4881 || ( ( aExp == 0x7FFF )
4882 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4884 return LIT64( 0x7FFFFFFFFFFFFFFF );
4886 return (int64_t) LIT64( 0x8000000000000000 );
4891 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4893 return roundAndPackInt64(aSign, aSig, aSigExtra, status);
4897 /*----------------------------------------------------------------------------
4898 | Returns the result of converting the extended double-precision floating-
4899 | point value `a' to the 64-bit two's complement integer format. The
4900 | conversion is performed according to the IEC/IEEE Standard for Binary
4901 | Floating-Point Arithmetic, except that the conversion is always rounded
4902 | toward zero. If `a' is a NaN, the largest positive integer is returned.
4903 | Otherwise, if the conversion overflows, the largest integer with the same
4904 | sign as `a' is returned.
4905 *----------------------------------------------------------------------------*/
4907 int64 floatx80_to_int64_round_to_zero(floatx80 a, float_status *status)
4910 int32 aExp, shiftCount;
4914 aSig = extractFloatx80Frac( a );
4915 aExp = extractFloatx80Exp( a );
4916 aSign = extractFloatx80Sign( a );
4917 shiftCount = aExp - 0x403E;
4918 if ( 0 <= shiftCount ) {
4919 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4920 if ( ( a.high != 0xC03E ) || aSig ) {
4921 float_raise(float_flag_invalid, status);
4922 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4923 return LIT64( 0x7FFFFFFFFFFFFFFF );
4926 return (int64_t) LIT64( 0x8000000000000000 );
4928 else if ( aExp < 0x3FFF ) {
4930 status->float_exception_flags |= float_flag_inexact;
4934 z = aSig>>( - shiftCount );
4935 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
4936 status->float_exception_flags |= float_flag_inexact;
4938 if ( aSign ) z = - z;
4943 /*----------------------------------------------------------------------------
4944 | Returns the result of converting the extended double-precision floating-
4945 | point value `a' to the single-precision floating-point format. The
4946 | conversion is performed according to the IEC/IEEE Standard for Binary
4947 | Floating-Point Arithmetic.
4948 *----------------------------------------------------------------------------*/
4950 float32 floatx80_to_float32(floatx80 a, float_status *status)
4956 aSig = extractFloatx80Frac( a );
4957 aExp = extractFloatx80Exp( a );
4958 aSign = extractFloatx80Sign( a );
4959 if ( aExp == 0x7FFF ) {
4960 if ( (uint64_t) ( aSig<<1 ) ) {
4961 return commonNaNToFloat32(floatx80ToCommonNaN(a, status), status);
4963 return packFloat32( aSign, 0xFF, 0 );
4965 shift64RightJamming( aSig, 33, &aSig );
4966 if ( aExp || aSig ) aExp -= 0x3F81;
4967 return roundAndPackFloat32(aSign, aExp, aSig, status);
4971 /*----------------------------------------------------------------------------
4972 | Returns the result of converting the extended double-precision floating-
4973 | point value `a' to the double-precision floating-point format. The
4974 | conversion is performed according to the IEC/IEEE Standard for Binary
4975 | Floating-Point Arithmetic.
4976 *----------------------------------------------------------------------------*/
4978 float64 floatx80_to_float64(floatx80 a, float_status *status)
4982 uint64_t aSig, zSig;
4984 aSig = extractFloatx80Frac( a );
4985 aExp = extractFloatx80Exp( a );
4986 aSign = extractFloatx80Sign( a );
4987 if ( aExp == 0x7FFF ) {
4988 if ( (uint64_t) ( aSig<<1 ) ) {
4989 return commonNaNToFloat64(floatx80ToCommonNaN(a, status), status);
4991 return packFloat64( aSign, 0x7FF, 0 );
4993 shift64RightJamming( aSig, 1, &zSig );
4994 if ( aExp || aSig ) aExp -= 0x3C01;
4995 return roundAndPackFloat64(aSign, aExp, zSig, status);
4999 /*----------------------------------------------------------------------------
5000 | Returns the result of converting the extended double-precision floating-
5001 | point value `a' to the quadruple-precision floating-point format. The
5002 | conversion is performed according to the IEC/IEEE Standard for Binary
5003 | Floating-Point Arithmetic.
5004 *----------------------------------------------------------------------------*/
5006 float128 floatx80_to_float128(floatx80 a, float_status *status)
5010 uint64_t aSig, zSig0, zSig1;
5012 aSig = extractFloatx80Frac( a );
5013 aExp = extractFloatx80Exp( a );
5014 aSign = extractFloatx80Sign( a );
5015 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
5016 return commonNaNToFloat128(floatx80ToCommonNaN(a, status), status);
5018 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
5019 return packFloat128( aSign, aExp, zSig0, zSig1 );
5023 /*----------------------------------------------------------------------------
5024 | Rounds the extended double-precision floating-point value `a' to an integer,
5025 | and returns the result as an extended quadruple-precision floating-point
5026 | value. The operation is performed according to the IEC/IEEE Standard for
5027 | Binary Floating-Point Arithmetic.
5028 *----------------------------------------------------------------------------*/
5030 floatx80 floatx80_round_to_int(floatx80 a, float_status *status)
5034 uint64_t lastBitMask, roundBitsMask;
5037 aExp = extractFloatx80Exp( a );
5038 if ( 0x403E <= aExp ) {
5039 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
5040 return propagateFloatx80NaN(a, a, status);
5044 if ( aExp < 0x3FFF ) {
5046 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
5049 status->float_exception_flags |= float_flag_inexact;
5050 aSign = extractFloatx80Sign( a );
5051 switch (status->float_rounding_mode) {
5052 case float_round_nearest_even:
5053 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
5056 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
5059 case float_round_ties_away:
5060 if (aExp == 0x3FFE) {
5061 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
5064 case float_round_down:
5067 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
5068 : packFloatx80( 0, 0, 0 );
5069 case float_round_up:
5071 aSign ? packFloatx80( 1, 0, 0 )
5072 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
5074 return packFloatx80( aSign, 0, 0 );
5077 lastBitMask <<= 0x403E - aExp;
5078 roundBitsMask = lastBitMask - 1;
5080 switch (status->float_rounding_mode) {
5081 case float_round_nearest_even:
5082 z.low += lastBitMask>>1;
5083 if ((z.low & roundBitsMask) == 0) {
5084 z.low &= ~lastBitMask;
5087 case float_round_ties_away:
5088 z.low += lastBitMask >> 1;
5090 case float_round_to_zero:
5092 case float_round_up:
5093 if (!extractFloatx80Sign(z)) {
5094 z.low += roundBitsMask;
5097 case float_round_down:
5098 if (extractFloatx80Sign(z)) {
5099 z.low += roundBitsMask;
5105 z.low &= ~ roundBitsMask;
5108 z.low = LIT64( 0x8000000000000000 );
5110 if (z.low != a.low) {
5111 status->float_exception_flags |= float_flag_inexact;
5117 /*----------------------------------------------------------------------------
5118 | Returns the result of adding the absolute values of the extended double-
5119 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
5120 | negated before being returned. `zSign' is ignored if the result is a NaN.
5121 | The addition is performed according to the IEC/IEEE Standard for Binary
5122 | Floating-Point Arithmetic.
5123 *----------------------------------------------------------------------------*/
5125 static floatx80 addFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5126 float_status *status)
5128 int32 aExp, bExp, zExp;
5129 uint64_t aSig, bSig, zSig0, zSig1;
5132 aSig = extractFloatx80Frac( a );
5133 aExp = extractFloatx80Exp( a );
5134 bSig = extractFloatx80Frac( b );
5135 bExp = extractFloatx80Exp( b );
5136 expDiff = aExp - bExp;
5137 if ( 0 < expDiff ) {
5138 if ( aExp == 0x7FFF ) {
5139 if ((uint64_t)(aSig << 1)) {
5140 return propagateFloatx80NaN(a, b, status);
5144 if ( bExp == 0 ) --expDiff;
5145 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5148 else if ( expDiff < 0 ) {
5149 if ( bExp == 0x7FFF ) {
5150 if ((uint64_t)(bSig << 1)) {
5151 return propagateFloatx80NaN(a, b, status);
5153 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5155 if ( aExp == 0 ) ++expDiff;
5156 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5160 if ( aExp == 0x7FFF ) {
5161 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5162 return propagateFloatx80NaN(a, b, status);
5167 zSig0 = aSig + bSig;
5169 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5175 zSig0 = aSig + bSig;
5176 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
5178 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5179 zSig0 |= LIT64( 0x8000000000000000 );
5182 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5183 zSign, zExp, zSig0, zSig1, status);
5186 /*----------------------------------------------------------------------------
5187 | Returns the result of subtracting the absolute values of the extended
5188 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5189 | difference is negated before being returned. `zSign' is ignored if the
5190 | result is a NaN. The subtraction is performed according to the IEC/IEEE
5191 | Standard for Binary Floating-Point Arithmetic.
5192 *----------------------------------------------------------------------------*/
5194 static floatx80 subFloatx80Sigs(floatx80 a, floatx80 b, flag zSign,
5195 float_status *status)
5197 int32 aExp, bExp, zExp;
5198 uint64_t aSig, bSig, zSig0, zSig1;
5202 aSig = extractFloatx80Frac( a );
5203 aExp = extractFloatx80Exp( a );
5204 bSig = extractFloatx80Frac( b );
5205 bExp = extractFloatx80Exp( b );
5206 expDiff = aExp - bExp;
5207 if ( 0 < expDiff ) goto aExpBigger;
5208 if ( expDiff < 0 ) goto bExpBigger;
5209 if ( aExp == 0x7FFF ) {
5210 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
5211 return propagateFloatx80NaN(a, b, status);
5213 float_raise(float_flag_invalid, status);
5214 z.low = floatx80_default_nan_low;
5215 z.high = floatx80_default_nan_high;
5223 if ( bSig < aSig ) goto aBigger;
5224 if ( aSig < bSig ) goto bBigger;
5225 return packFloatx80(status->float_rounding_mode == float_round_down, 0, 0);
5227 if ( bExp == 0x7FFF ) {
5228 if ((uint64_t)(bSig << 1)) {
5229 return propagateFloatx80NaN(a, b, status);
5231 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5233 if ( aExp == 0 ) ++expDiff;
5234 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5236 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5239 goto normalizeRoundAndPack;
5241 if ( aExp == 0x7FFF ) {
5242 if ((uint64_t)(aSig << 1)) {
5243 return propagateFloatx80NaN(a, b, status);
5247 if ( bExp == 0 ) --expDiff;
5248 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5250 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5252 normalizeRoundAndPack:
5253 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
5254 zSign, zExp, zSig0, zSig1, status);
5257 /*----------------------------------------------------------------------------
5258 | Returns the result of adding the extended double-precision floating-point
5259 | values `a' and `b'. The operation is performed according to the IEC/IEEE
5260 | Standard for Binary Floating-Point Arithmetic.
5261 *----------------------------------------------------------------------------*/
5263 floatx80 floatx80_add(floatx80 a, floatx80 b, float_status *status)
5267 aSign = extractFloatx80Sign( a );
5268 bSign = extractFloatx80Sign( b );
5269 if ( aSign == bSign ) {
5270 return addFloatx80Sigs(a, b, aSign, status);
5273 return subFloatx80Sigs(a, b, aSign, status);
5278 /*----------------------------------------------------------------------------
5279 | Returns the result of subtracting the extended double-precision floating-
5280 | point values `a' and `b'. The operation is performed according to the
5281 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5282 *----------------------------------------------------------------------------*/
5284 floatx80 floatx80_sub(floatx80 a, floatx80 b, float_status *status)
5288 aSign = extractFloatx80Sign( a );
5289 bSign = extractFloatx80Sign( b );
5290 if ( aSign == bSign ) {
5291 return subFloatx80Sigs(a, b, aSign, status);
5294 return addFloatx80Sigs(a, b, aSign, status);
5299 /*----------------------------------------------------------------------------
5300 | Returns the result of multiplying the extended double-precision floating-
5301 | point values `a' and `b'. The operation is performed according to the
5302 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5303 *----------------------------------------------------------------------------*/
5305 floatx80 floatx80_mul(floatx80 a, floatx80 b, float_status *status)
5307 flag aSign, bSign, zSign;
5308 int32 aExp, bExp, zExp;
5309 uint64_t aSig, bSig, zSig0, zSig1;
5312 aSig = extractFloatx80Frac( a );
5313 aExp = extractFloatx80Exp( a );
5314 aSign = extractFloatx80Sign( a );
5315 bSig = extractFloatx80Frac( b );
5316 bExp = extractFloatx80Exp( b );
5317 bSign = extractFloatx80Sign( b );
5318 zSign = aSign ^ bSign;
5319 if ( aExp == 0x7FFF ) {
5320 if ( (uint64_t) ( aSig<<1 )
5321 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5322 return propagateFloatx80NaN(a, b, status);
5324 if ( ( bExp | bSig ) == 0 ) goto invalid;
5325 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5327 if ( bExp == 0x7FFF ) {
5328 if ((uint64_t)(bSig << 1)) {
5329 return propagateFloatx80NaN(a, b, status);
5331 if ( ( aExp | aSig ) == 0 ) {
5333 float_raise(float_flag_invalid, status);
5334 z.low = floatx80_default_nan_low;
5335 z.high = floatx80_default_nan_high;
5338 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5341 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5342 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5345 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5346 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5348 zExp = aExp + bExp - 0x3FFE;
5349 mul64To128( aSig, bSig, &zSig0, &zSig1 );
5350 if ( 0 < (int64_t) zSig0 ) {
5351 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5354 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5355 zSign, zExp, zSig0, zSig1, status);
5358 /*----------------------------------------------------------------------------
5359 | Returns the result of dividing the extended double-precision floating-point
5360 | value `a' by the corresponding value `b'. The operation is performed
5361 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5362 *----------------------------------------------------------------------------*/
5364 floatx80 floatx80_div(floatx80 a, floatx80 b, float_status *status)
5366 flag aSign, bSign, zSign;
5367 int32 aExp, bExp, zExp;
5368 uint64_t aSig, bSig, zSig0, zSig1;
5369 uint64_t rem0, rem1, rem2, term0, term1, term2;
5372 aSig = extractFloatx80Frac( a );
5373 aExp = extractFloatx80Exp( a );
5374 aSign = extractFloatx80Sign( a );
5375 bSig = extractFloatx80Frac( b );
5376 bExp = extractFloatx80Exp( b );
5377 bSign = extractFloatx80Sign( b );
5378 zSign = aSign ^ bSign;
5379 if ( aExp == 0x7FFF ) {
5380 if ((uint64_t)(aSig << 1)) {
5381 return propagateFloatx80NaN(a, b, status);
5383 if ( bExp == 0x7FFF ) {
5384 if ((uint64_t)(bSig << 1)) {
5385 return propagateFloatx80NaN(a, b, status);
5389 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5391 if ( bExp == 0x7FFF ) {
5392 if ((uint64_t)(bSig << 1)) {
5393 return propagateFloatx80NaN(a, b, status);
5395 return packFloatx80( zSign, 0, 0 );
5399 if ( ( aExp | aSig ) == 0 ) {
5401 float_raise(float_flag_invalid, status);
5402 z.low = floatx80_default_nan_low;
5403 z.high = floatx80_default_nan_high;
5406 float_raise(float_flag_divbyzero, status);
5407 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5409 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5412 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5413 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5415 zExp = aExp - bExp + 0x3FFE;
5417 if ( bSig <= aSig ) {
5418 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5421 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5422 mul64To128( bSig, zSig0, &term0, &term1 );
5423 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
5424 while ( (int64_t) rem0 < 0 ) {
5426 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5428 zSig1 = estimateDiv128To64( rem1, 0, bSig );
5429 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
5430 mul64To128( bSig, zSig1, &term1, &term2 );
5431 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5432 while ( (int64_t) rem1 < 0 ) {
5434 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5436 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5438 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5439 zSign, zExp, zSig0, zSig1, status);
5442 /*----------------------------------------------------------------------------
5443 | Returns the remainder of the extended double-precision floating-point value
5444 | `a' with respect to the corresponding value `b'. The operation is performed
5445 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5446 *----------------------------------------------------------------------------*/
5448 floatx80 floatx80_rem(floatx80 a, floatx80 b, float_status *status)
5451 int32 aExp, bExp, expDiff;
5452 uint64_t aSig0, aSig1, bSig;
5453 uint64_t q, term0, term1, alternateASig0, alternateASig1;
5456 aSig0 = extractFloatx80Frac( a );
5457 aExp = extractFloatx80Exp( a );
5458 aSign = extractFloatx80Sign( a );
5459 bSig = extractFloatx80Frac( b );
5460 bExp = extractFloatx80Exp( b );
5461 if ( aExp == 0x7FFF ) {
5462 if ( (uint64_t) ( aSig0<<1 )
5463 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
5464 return propagateFloatx80NaN(a, b, status);
5468 if ( bExp == 0x7FFF ) {
5469 if ((uint64_t)(bSig << 1)) {
5470 return propagateFloatx80NaN(a, b, status);
5477 float_raise(float_flag_invalid, status);
5478 z.low = floatx80_default_nan_low;
5479 z.high = floatx80_default_nan_high;
5482 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5485 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
5486 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5488 bSig |= LIT64( 0x8000000000000000 );
5490 expDiff = aExp - bExp;
5492 if ( expDiff < 0 ) {
5493 if ( expDiff < -1 ) return a;
5494 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5497 q = ( bSig <= aSig0 );
5498 if ( q ) aSig0 -= bSig;
5500 while ( 0 < expDiff ) {
5501 q = estimateDiv128To64( aSig0, aSig1, bSig );
5502 q = ( 2 < q ) ? q - 2 : 0;
5503 mul64To128( bSig, q, &term0, &term1 );
5504 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5505 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5509 if ( 0 < expDiff ) {
5510 q = estimateDiv128To64( aSig0, aSig1, bSig );
5511 q = ( 2 < q ) ? q - 2 : 0;
5513 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5514 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5515 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5516 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5518 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5525 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5526 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5527 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5530 aSig0 = alternateASig0;
5531 aSig1 = alternateASig1;
5535 normalizeRoundAndPackFloatx80(
5536 80, zSign, bExp + expDiff, aSig0, aSig1, status);
5540 /*----------------------------------------------------------------------------
5541 | Returns the square root of the extended double-precision floating-point
5542 | value `a'. The operation is performed according to the IEC/IEEE Standard
5543 | for Binary Floating-Point Arithmetic.
5544 *----------------------------------------------------------------------------*/
5546 floatx80 floatx80_sqrt(floatx80 a, float_status *status)
5550 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5551 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
5554 aSig0 = extractFloatx80Frac( a );
5555 aExp = extractFloatx80Exp( a );
5556 aSign = extractFloatx80Sign( a );
5557 if ( aExp == 0x7FFF ) {
5558 if ((uint64_t)(aSig0 << 1)) {
5559 return propagateFloatx80NaN(a, a, status);
5561 if ( ! aSign ) return a;
5565 if ( ( aExp | aSig0 ) == 0 ) return a;
5567 float_raise(float_flag_invalid, status);
5568 z.low = floatx80_default_nan_low;
5569 z.high = floatx80_default_nan_high;
5573 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5574 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5576 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5577 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5578 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5579 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5580 doubleZSig0 = zSig0<<1;
5581 mul64To128( zSig0, zSig0, &term0, &term1 );
5582 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
5583 while ( (int64_t) rem0 < 0 ) {
5586 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5588 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5589 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5590 if ( zSig1 == 0 ) zSig1 = 1;
5591 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5592 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5593 mul64To128( zSig1, zSig1, &term2, &term3 );
5594 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
5595 while ( (int64_t) rem1 < 0 ) {
5597 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5599 term2 |= doubleZSig0;
5600 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5602 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5604 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5605 zSig0 |= doubleZSig0;
5606 return roundAndPackFloatx80(status->floatx80_rounding_precision,
5607 0, zExp, zSig0, zSig1, status);
5610 /*----------------------------------------------------------------------------
5611 | Returns 1 if the extended double-precision floating-point value `a' is equal
5612 | to the corresponding value `b', and 0 otherwise. The invalid exception is
5613 | raised if either operand is a NaN. Otherwise, the comparison is performed
5614 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5615 *----------------------------------------------------------------------------*/
5617 int floatx80_eq(floatx80 a, floatx80 b, float_status *status)
5620 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5621 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5622 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5623 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5625 float_raise(float_flag_invalid, status);
5630 && ( ( a.high == b.high )
5632 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5637 /*----------------------------------------------------------------------------
5638 | Returns 1 if the extended double-precision floating-point value `a' is
5639 | less than or equal to the corresponding value `b', and 0 otherwise. The
5640 | invalid exception is raised if either operand is a NaN. The comparison is
5641 | performed according to the IEC/IEEE Standard for Binary Floating-Point
5643 *----------------------------------------------------------------------------*/
5645 int floatx80_le(floatx80 a, floatx80 b, float_status *status)
5649 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5650 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5651 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5652 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5654 float_raise(float_flag_invalid, status);
5657 aSign = extractFloatx80Sign( a );
5658 bSign = extractFloatx80Sign( b );
5659 if ( aSign != bSign ) {
5662 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5666 aSign ? le128( b.high, b.low, a.high, a.low )
5667 : le128( a.high, a.low, b.high, b.low );
5671 /*----------------------------------------------------------------------------
5672 | Returns 1 if the extended double-precision floating-point value `a' is
5673 | less than the corresponding value `b', and 0 otherwise. The invalid
5674 | exception is raised if either operand is a NaN. The comparison is performed
5675 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5676 *----------------------------------------------------------------------------*/
5678 int floatx80_lt(floatx80 a, floatx80 b, float_status *status)
5682 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5683 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5684 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5685 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5687 float_raise(float_flag_invalid, status);
5690 aSign = extractFloatx80Sign( a );
5691 bSign = extractFloatx80Sign( b );
5692 if ( aSign != bSign ) {
5695 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5699 aSign ? lt128( b.high, b.low, a.high, a.low )
5700 : lt128( a.high, a.low, b.high, b.low );
5704 /*----------------------------------------------------------------------------
5705 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5706 | cannot be compared, and 0 otherwise. The invalid exception is raised if
5707 | either operand is a NaN. The comparison is performed according to the
5708 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5709 *----------------------------------------------------------------------------*/
5710 int floatx80_unordered(floatx80 a, floatx80 b, float_status *status)
5712 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5713 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5714 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5715 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5717 float_raise(float_flag_invalid, status);
5723 /*----------------------------------------------------------------------------
5724 | Returns 1 if the extended double-precision floating-point value `a' is
5725 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5726 | cause an exception. The comparison is performed according to the IEC/IEEE
5727 | Standard for Binary Floating-Point Arithmetic.
5728 *----------------------------------------------------------------------------*/
5730 int floatx80_eq_quiet(floatx80 a, floatx80 b, float_status *status)
5733 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5734 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5735 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5736 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5738 if ( floatx80_is_signaling_nan( a )
5739 || floatx80_is_signaling_nan( b ) ) {
5740 float_raise(float_flag_invalid, status);
5746 && ( ( a.high == b.high )
5748 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
5753 /*----------------------------------------------------------------------------
5754 | Returns 1 if the extended double-precision floating-point value `a' is less
5755 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5756 | do not cause an exception. Otherwise, the comparison is performed according
5757 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5758 *----------------------------------------------------------------------------*/
5760 int floatx80_le_quiet(floatx80 a, floatx80 b, float_status *status)
5764 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5765 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5766 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5767 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5769 if ( floatx80_is_signaling_nan( a )
5770 || floatx80_is_signaling_nan( b ) ) {
5771 float_raise(float_flag_invalid, status);
5775 aSign = extractFloatx80Sign( a );
5776 bSign = extractFloatx80Sign( b );
5777 if ( aSign != bSign ) {
5780 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5784 aSign ? le128( b.high, b.low, a.high, a.low )
5785 : le128( a.high, a.low, b.high, b.low );
5789 /*----------------------------------------------------------------------------
5790 | Returns 1 if the extended double-precision floating-point value `a' is less
5791 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5792 | an exception. Otherwise, the comparison is performed according to the
5793 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5794 *----------------------------------------------------------------------------*/
5796 int floatx80_lt_quiet(floatx80 a, floatx80 b, float_status *status)
5800 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5801 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5802 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5803 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5805 if ( floatx80_is_signaling_nan( a )
5806 || floatx80_is_signaling_nan( b ) ) {
5807 float_raise(float_flag_invalid, status);
5811 aSign = extractFloatx80Sign( a );
5812 bSign = extractFloatx80Sign( b );
5813 if ( aSign != bSign ) {
5816 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
5820 aSign ? lt128( b.high, b.low, a.high, a.low )
5821 : lt128( a.high, a.low, b.high, b.low );
5825 /*----------------------------------------------------------------------------
5826 | Returns 1 if the extended double-precision floating-point values `a' and `b'
5827 | cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5828 | The comparison is performed according to the IEC/IEEE Standard for Binary
5829 | Floating-Point Arithmetic.
5830 *----------------------------------------------------------------------------*/
5831 int floatx80_unordered_quiet(floatx80 a, floatx80 b, float_status *status)
5833 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5834 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5835 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5836 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5838 if ( floatx80_is_signaling_nan( a )
5839 || floatx80_is_signaling_nan( b ) ) {
5840 float_raise(float_flag_invalid, status);
5847 /*----------------------------------------------------------------------------
5848 | Returns the result of converting the quadruple-precision floating-point
5849 | value `a' to the 32-bit two's complement integer format. The conversion
5850 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5851 | Arithmetic---which means in particular that the conversion is rounded
5852 | according to the current rounding mode. If `a' is a NaN, the largest
5853 | positive integer is returned. Otherwise, if the conversion overflows, the
5854 | largest integer with the same sign as `a' is returned.
5855 *----------------------------------------------------------------------------*/
5857 int32 float128_to_int32(float128 a, float_status *status)
5860 int32 aExp, shiftCount;
5861 uint64_t aSig0, aSig1;
5863 aSig1 = extractFloat128Frac1( a );
5864 aSig0 = extractFloat128Frac0( a );
5865 aExp = extractFloat128Exp( a );
5866 aSign = extractFloat128Sign( a );
5867 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5868 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5869 aSig0 |= ( aSig1 != 0 );
5870 shiftCount = 0x4028 - aExp;
5871 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5872 return roundAndPackInt32(aSign, aSig0, status);
5876 /*----------------------------------------------------------------------------
5877 | Returns the result of converting the quadruple-precision floating-point
5878 | value `a' to the 32-bit two's complement integer format. The conversion
5879 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5880 | Arithmetic, except that the conversion is always rounded toward zero. If
5881 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5882 | conversion overflows, the largest integer with the same sign as `a' is
5884 *----------------------------------------------------------------------------*/
5886 int32 float128_to_int32_round_to_zero(float128 a, float_status *status)
5889 int32 aExp, shiftCount;
5890 uint64_t aSig0, aSig1, savedASig;
5893 aSig1 = extractFloat128Frac1( a );
5894 aSig0 = extractFloat128Frac0( a );
5895 aExp = extractFloat128Exp( a );
5896 aSign = extractFloat128Sign( a );
5897 aSig0 |= ( aSig1 != 0 );
5898 if ( 0x401E < aExp ) {
5899 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5902 else if ( aExp < 0x3FFF ) {
5903 if (aExp || aSig0) {
5904 status->float_exception_flags |= float_flag_inexact;
5908 aSig0 |= LIT64( 0x0001000000000000 );
5909 shiftCount = 0x402F - aExp;
5911 aSig0 >>= shiftCount;
5913 if ( aSign ) z = - z;
5914 if ( ( z < 0 ) ^ aSign ) {
5916 float_raise(float_flag_invalid, status);
5917 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
5919 if ( ( aSig0<<shiftCount ) != savedASig ) {
5920 status->float_exception_flags |= float_flag_inexact;
5926 /*----------------------------------------------------------------------------
5927 | Returns the result of converting the quadruple-precision floating-point
5928 | value `a' to the 64-bit two's complement integer format. The conversion
5929 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5930 | Arithmetic---which means in particular that the conversion is rounded
5931 | according to the current rounding mode. If `a' is a NaN, the largest
5932 | positive integer is returned. Otherwise, if the conversion overflows, the
5933 | largest integer with the same sign as `a' is returned.
5934 *----------------------------------------------------------------------------*/
5936 int64 float128_to_int64(float128 a, float_status *status)
5939 int32 aExp, shiftCount;
5940 uint64_t aSig0, aSig1;
5942 aSig1 = extractFloat128Frac1( a );
5943 aSig0 = extractFloat128Frac0( a );
5944 aExp = extractFloat128Exp( a );
5945 aSign = extractFloat128Sign( a );
5946 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5947 shiftCount = 0x402F - aExp;
5948 if ( shiftCount <= 0 ) {
5949 if ( 0x403E < aExp ) {
5950 float_raise(float_flag_invalid, status);
5952 || ( ( aExp == 0x7FFF )
5953 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5956 return LIT64( 0x7FFFFFFFFFFFFFFF );
5958 return (int64_t) LIT64( 0x8000000000000000 );
5960 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5963 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5965 return roundAndPackInt64(aSign, aSig0, aSig1, status);
5969 /*----------------------------------------------------------------------------
5970 | Returns the result of converting the quadruple-precision floating-point
5971 | value `a' to the 64-bit two's complement integer format. The conversion
5972 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
5973 | Arithmetic, except that the conversion is always rounded toward zero.
5974 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5975 | the conversion overflows, the largest integer with the same sign as `a' is
5977 *----------------------------------------------------------------------------*/
5979 int64 float128_to_int64_round_to_zero(float128 a, float_status *status)
5982 int32 aExp, shiftCount;
5983 uint64_t aSig0, aSig1;
5986 aSig1 = extractFloat128Frac1( a );
5987 aSig0 = extractFloat128Frac0( a );
5988 aExp = extractFloat128Exp( a );
5989 aSign = extractFloat128Sign( a );
5990 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5991 shiftCount = aExp - 0x402F;
5992 if ( 0 < shiftCount ) {
5993 if ( 0x403E <= aExp ) {
5994 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5995 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5996 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5998 status->float_exception_flags |= float_flag_inexact;
6002 float_raise(float_flag_invalid, status);
6003 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
6004 return LIT64( 0x7FFFFFFFFFFFFFFF );
6007 return (int64_t) LIT64( 0x8000000000000000 );
6009 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
6010 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
6011 status->float_exception_flags |= float_flag_inexact;
6015 if ( aExp < 0x3FFF ) {
6016 if ( aExp | aSig0 | aSig1 ) {
6017 status->float_exception_flags |= float_flag_inexact;
6021 z = aSig0>>( - shiftCount );
6023 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
6024 status->float_exception_flags |= float_flag_inexact;
6027 if ( aSign ) z = - z;
6032 /*----------------------------------------------------------------------------
6033 | Returns the result of converting the quadruple-precision floating-point
6034 | value `a' to the single-precision floating-point format. The conversion
6035 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6037 *----------------------------------------------------------------------------*/
6039 float32 float128_to_float32(float128 a, float_status *status)
6043 uint64_t aSig0, aSig1;
6046 aSig1 = extractFloat128Frac1( a );
6047 aSig0 = extractFloat128Frac0( a );
6048 aExp = extractFloat128Exp( a );
6049 aSign = extractFloat128Sign( a );
6050 if ( aExp == 0x7FFF ) {
6051 if ( aSig0 | aSig1 ) {
6052 return commonNaNToFloat32(float128ToCommonNaN(a, status), status);
6054 return packFloat32( aSign, 0xFF, 0 );
6056 aSig0 |= ( aSig1 != 0 );
6057 shift64RightJamming( aSig0, 18, &aSig0 );
6059 if ( aExp || zSig ) {
6063 return roundAndPackFloat32(aSign, aExp, zSig, status);
6067 /*----------------------------------------------------------------------------
6068 | Returns the result of converting the quadruple-precision floating-point
6069 | value `a' to the double-precision floating-point format. The conversion
6070 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
6072 *----------------------------------------------------------------------------*/
6074 float64 float128_to_float64(float128 a, float_status *status)
6078 uint64_t aSig0, aSig1;
6080 aSig1 = extractFloat128Frac1( a );
6081 aSig0 = extractFloat128Frac0( a );
6082 aExp = extractFloat128Exp( a );
6083 aSign = extractFloat128Sign( a );
6084 if ( aExp == 0x7FFF ) {
6085 if ( aSig0 | aSig1 ) {
6086 return commonNaNToFloat64(float128ToCommonNaN(a, status), status);
6088 return packFloat64( aSign, 0x7FF, 0 );
6090 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6091 aSig0 |= ( aSig1 != 0 );
6092 if ( aExp || aSig0 ) {
6093 aSig0 |= LIT64( 0x4000000000000000 );
6096 return roundAndPackFloat64(aSign, aExp, aSig0, status);
6100 /*----------------------------------------------------------------------------
6101 | Returns the result of converting the quadruple-precision floating-point
6102 | value `a' to the extended double-precision floating-point format. The
6103 | conversion is performed according to the IEC/IEEE Standard for Binary
6104 | Floating-Point Arithmetic.
6105 *----------------------------------------------------------------------------*/
6107 floatx80 float128_to_floatx80(float128 a, float_status *status)
6111 uint64_t aSig0, aSig1;
6113 aSig1 = extractFloat128Frac1( a );
6114 aSig0 = extractFloat128Frac0( a );
6115 aExp = extractFloat128Exp( a );
6116 aSign = extractFloat128Sign( a );
6117 if ( aExp == 0x7FFF ) {
6118 if ( aSig0 | aSig1 ) {
6119 return commonNaNToFloatx80(float128ToCommonNaN(a, status), status);
6121 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
6124 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
6125 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6128 aSig0 |= LIT64( 0x0001000000000000 );
6130 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
6131 return roundAndPackFloatx80(80, aSign, aExp, aSig0, aSig1, status);
6135 /*----------------------------------------------------------------------------
6136 | Rounds the quadruple-precision floating-point value `a' to an integer, and
6137 | returns the result as a quadruple-precision floating-point value. The
6138 | operation is performed according to the IEC/IEEE Standard for Binary
6139 | Floating-Point Arithmetic.
6140 *----------------------------------------------------------------------------*/
6142 float128 float128_round_to_int(float128 a, float_status *status)
6146 uint64_t lastBitMask, roundBitsMask;
6149 aExp = extractFloat128Exp( a );
6150 if ( 0x402F <= aExp ) {
6151 if ( 0x406F <= aExp ) {
6152 if ( ( aExp == 0x7FFF )
6153 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6155 return propagateFloat128NaN(a, a, status);
6160 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6161 roundBitsMask = lastBitMask - 1;
6163 switch (status->float_rounding_mode) {
6164 case float_round_nearest_even:
6165 if ( lastBitMask ) {
6166 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6167 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6170 if ( (int64_t) z.low < 0 ) {
6172 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
6176 case float_round_ties_away:
6178 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6180 if ((int64_t) z.low < 0) {
6185 case float_round_to_zero:
6187 case float_round_up:
6188 if (!extractFloat128Sign(z)) {
6189 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6192 case float_round_down:
6193 if (extractFloat128Sign(z)) {
6194 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6200 z.low &= ~ roundBitsMask;
6203 if ( aExp < 0x3FFF ) {
6204 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
6205 status->float_exception_flags |= float_flag_inexact;
6206 aSign = extractFloat128Sign( a );
6207 switch (status->float_rounding_mode) {
6208 case float_round_nearest_even:
6209 if ( ( aExp == 0x3FFE )
6210 && ( extractFloat128Frac0( a )
6211 | extractFloat128Frac1( a ) )
6213 return packFloat128( aSign, 0x3FFF, 0, 0 );
6216 case float_round_ties_away:
6217 if (aExp == 0x3FFE) {
6218 return packFloat128(aSign, 0x3FFF, 0, 0);
6221 case float_round_down:
6223 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6224 : packFloat128( 0, 0, 0, 0 );
6225 case float_round_up:
6227 aSign ? packFloat128( 1, 0, 0, 0 )
6228 : packFloat128( 0, 0x3FFF, 0, 0 );
6230 return packFloat128( aSign, 0, 0, 0 );
6233 lastBitMask <<= 0x402F - aExp;
6234 roundBitsMask = lastBitMask - 1;
6237 switch (status->float_rounding_mode) {
6238 case float_round_nearest_even:
6239 z.high += lastBitMask>>1;
6240 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6241 z.high &= ~ lastBitMask;
6244 case float_round_ties_away:
6245 z.high += lastBitMask>>1;
6247 case float_round_to_zero:
6249 case float_round_up:
6250 if (!extractFloat128Sign(z)) {
6251 z.high |= ( a.low != 0 );
6252 z.high += roundBitsMask;
6255 case float_round_down:
6256 if (extractFloat128Sign(z)) {
6257 z.high |= (a.low != 0);
6258 z.high += roundBitsMask;
6264 z.high &= ~ roundBitsMask;
6266 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6267 status->float_exception_flags |= float_flag_inexact;
6273 /*----------------------------------------------------------------------------
6274 | Returns the result of adding the absolute values of the quadruple-precision
6275 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6276 | before being returned. `zSign' is ignored if the result is a NaN.
6277 | The addition is performed according to the IEC/IEEE Standard for Binary
6278 | Floating-Point Arithmetic.
6279 *----------------------------------------------------------------------------*/
6281 static float128 addFloat128Sigs(float128 a, float128 b, flag zSign,
6282 float_status *status)
6284 int32 aExp, bExp, zExp;
6285 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6288 aSig1 = extractFloat128Frac1( a );
6289 aSig0 = extractFloat128Frac0( a );
6290 aExp = extractFloat128Exp( a );
6291 bSig1 = extractFloat128Frac1( b );
6292 bSig0 = extractFloat128Frac0( b );
6293 bExp = extractFloat128Exp( b );
6294 expDiff = aExp - bExp;
6295 if ( 0 < expDiff ) {
6296 if ( aExp == 0x7FFF ) {
6297 if (aSig0 | aSig1) {
6298 return propagateFloat128NaN(a, b, status);
6306 bSig0 |= LIT64( 0x0001000000000000 );
6308 shift128ExtraRightJamming(
6309 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6312 else if ( expDiff < 0 ) {
6313 if ( bExp == 0x7FFF ) {
6314 if (bSig0 | bSig1) {
6315 return propagateFloat128NaN(a, b, status);
6317 return packFloat128( zSign, 0x7FFF, 0, 0 );
6323 aSig0 |= LIT64( 0x0001000000000000 );
6325 shift128ExtraRightJamming(
6326 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6330 if ( aExp == 0x7FFF ) {
6331 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6332 return propagateFloat128NaN(a, b, status);
6336 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6338 if (status->flush_to_zero) {
6339 if (zSig0 | zSig1) {
6340 float_raise(float_flag_output_denormal, status);
6342 return packFloat128(zSign, 0, 0, 0);
6344 return packFloat128( zSign, 0, zSig0, zSig1 );
6347 zSig0 |= LIT64( 0x0002000000000000 );
6351 aSig0 |= LIT64( 0x0001000000000000 );
6352 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6354 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6357 shift128ExtraRightJamming(
6358 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6360 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6364 /*----------------------------------------------------------------------------
6365 | Returns the result of subtracting the absolute values of the quadruple-
6366 | precision floating-point values `a' and `b'. If `zSign' is 1, the
6367 | difference is negated before being returned. `zSign' is ignored if the
6368 | result is a NaN. The subtraction is performed according to the IEC/IEEE
6369 | Standard for Binary Floating-Point Arithmetic.
6370 *----------------------------------------------------------------------------*/
6372 static float128 subFloat128Sigs(float128 a, float128 b, flag zSign,
6373 float_status *status)
6375 int32 aExp, bExp, zExp;
6376 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
6380 aSig1 = extractFloat128Frac1( a );
6381 aSig0 = extractFloat128Frac0( a );
6382 aExp = extractFloat128Exp( a );
6383 bSig1 = extractFloat128Frac1( b );
6384 bSig0 = extractFloat128Frac0( b );
6385 bExp = extractFloat128Exp( b );
6386 expDiff = aExp - bExp;
6387 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6388 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6389 if ( 0 < expDiff ) goto aExpBigger;
6390 if ( expDiff < 0 ) goto bExpBigger;
6391 if ( aExp == 0x7FFF ) {
6392 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6393 return propagateFloat128NaN(a, b, status);
6395 float_raise(float_flag_invalid, status);
6396 z.low = float128_default_nan_low;
6397 z.high = float128_default_nan_high;
6404 if ( bSig0 < aSig0 ) goto aBigger;
6405 if ( aSig0 < bSig0 ) goto bBigger;
6406 if ( bSig1 < aSig1 ) goto aBigger;
6407 if ( aSig1 < bSig1 ) goto bBigger;
6408 return packFloat128(status->float_rounding_mode == float_round_down,
6411 if ( bExp == 0x7FFF ) {
6412 if (bSig0 | bSig1) {
6413 return propagateFloat128NaN(a, b, status);
6415 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6421 aSig0 |= LIT64( 0x4000000000000000 );
6423 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6424 bSig0 |= LIT64( 0x4000000000000000 );
6426 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6429 goto normalizeRoundAndPack;
6431 if ( aExp == 0x7FFF ) {
6432 if (aSig0 | aSig1) {
6433 return propagateFloat128NaN(a, b, status);
6441 bSig0 |= LIT64( 0x4000000000000000 );
6443 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6444 aSig0 |= LIT64( 0x4000000000000000 );
6446 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6448 normalizeRoundAndPack:
6450 return normalizeRoundAndPackFloat128(zSign, zExp - 14, zSig0, zSig1,
6455 /*----------------------------------------------------------------------------
6456 | Returns the result of adding the quadruple-precision floating-point values
6457 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6458 | for Binary Floating-Point Arithmetic.
6459 *----------------------------------------------------------------------------*/
6461 float128 float128_add(float128 a, float128 b, float_status *status)
6465 aSign = extractFloat128Sign( a );
6466 bSign = extractFloat128Sign( b );
6467 if ( aSign == bSign ) {
6468 return addFloat128Sigs(a, b, aSign, status);
6471 return subFloat128Sigs(a, b, aSign, status);
6476 /*----------------------------------------------------------------------------
6477 | Returns the result of subtracting the quadruple-precision floating-point
6478 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6479 | Standard for Binary Floating-Point Arithmetic.
6480 *----------------------------------------------------------------------------*/
6482 float128 float128_sub(float128 a, float128 b, float_status *status)
6486 aSign = extractFloat128Sign( a );
6487 bSign = extractFloat128Sign( b );
6488 if ( aSign == bSign ) {
6489 return subFloat128Sigs(a, b, aSign, status);
6492 return addFloat128Sigs(a, b, aSign, status);
6497 /*----------------------------------------------------------------------------
6498 | Returns the result of multiplying the quadruple-precision floating-point
6499 | values `a' and `b'. The operation is performed according to the IEC/IEEE
6500 | Standard for Binary Floating-Point Arithmetic.
6501 *----------------------------------------------------------------------------*/
6503 float128 float128_mul(float128 a, float128 b, float_status *status)
6505 flag aSign, bSign, zSign;
6506 int32 aExp, bExp, zExp;
6507 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
6510 aSig1 = extractFloat128Frac1( a );
6511 aSig0 = extractFloat128Frac0( a );
6512 aExp = extractFloat128Exp( a );
6513 aSign = extractFloat128Sign( a );
6514 bSig1 = extractFloat128Frac1( b );
6515 bSig0 = extractFloat128Frac0( b );
6516 bExp = extractFloat128Exp( b );
6517 bSign = extractFloat128Sign( b );
6518 zSign = aSign ^ bSign;
6519 if ( aExp == 0x7FFF ) {
6520 if ( ( aSig0 | aSig1 )
6521 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6522 return propagateFloat128NaN(a, b, status);
6524 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6525 return packFloat128( zSign, 0x7FFF, 0, 0 );
6527 if ( bExp == 0x7FFF ) {
6528 if (bSig0 | bSig1) {
6529 return propagateFloat128NaN(a, b, status);
6531 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6533 float_raise(float_flag_invalid, status);
6534 z.low = float128_default_nan_low;
6535 z.high = float128_default_nan_high;
6538 return packFloat128( zSign, 0x7FFF, 0, 0 );
6541 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6542 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6545 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6546 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6548 zExp = aExp + bExp - 0x4000;
6549 aSig0 |= LIT64( 0x0001000000000000 );
6550 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6551 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6552 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6553 zSig2 |= ( zSig3 != 0 );
6554 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6555 shift128ExtraRightJamming(
6556 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6559 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6563 /*----------------------------------------------------------------------------
6564 | Returns the result of dividing the quadruple-precision floating-point value
6565 | `a' by the corresponding value `b'. The operation is performed according to
6566 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6567 *----------------------------------------------------------------------------*/
6569 float128 float128_div(float128 a, float128 b, float_status *status)
6571 flag aSign, bSign, zSign;
6572 int32 aExp, bExp, zExp;
6573 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6574 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6577 aSig1 = extractFloat128Frac1( a );
6578 aSig0 = extractFloat128Frac0( a );
6579 aExp = extractFloat128Exp( a );
6580 aSign = extractFloat128Sign( a );
6581 bSig1 = extractFloat128Frac1( b );
6582 bSig0 = extractFloat128Frac0( b );
6583 bExp = extractFloat128Exp( b );
6584 bSign = extractFloat128Sign( b );
6585 zSign = aSign ^ bSign;
6586 if ( aExp == 0x7FFF ) {
6587 if (aSig0 | aSig1) {
6588 return propagateFloat128NaN(a, b, status);
6590 if ( bExp == 0x7FFF ) {
6591 if (bSig0 | bSig1) {
6592 return propagateFloat128NaN(a, b, status);
6596 return packFloat128( zSign, 0x7FFF, 0, 0 );
6598 if ( bExp == 0x7FFF ) {
6599 if (bSig0 | bSig1) {
6600 return propagateFloat128NaN(a, b, status);
6602 return packFloat128( zSign, 0, 0, 0 );
6605 if ( ( bSig0 | bSig1 ) == 0 ) {
6606 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6608 float_raise(float_flag_invalid, status);
6609 z.low = float128_default_nan_low;
6610 z.high = float128_default_nan_high;
6613 float_raise(float_flag_divbyzero, status);
6614 return packFloat128( zSign, 0x7FFF, 0, 0 );
6616 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6619 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6620 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6622 zExp = aExp - bExp + 0x3FFD;
6624 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6626 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6627 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6628 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6631 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6632 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6633 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
6634 while ( (int64_t) rem0 < 0 ) {
6636 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6638 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6639 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6640 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6641 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
6642 while ( (int64_t) rem1 < 0 ) {
6644 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6646 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6648 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6649 return roundAndPackFloat128(zSign, zExp, zSig0, zSig1, zSig2, status);
6653 /*----------------------------------------------------------------------------
6654 | Returns the remainder of the quadruple-precision floating-point value `a'
6655 | with respect to the corresponding value `b'. The operation is performed
6656 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6657 *----------------------------------------------------------------------------*/
6659 float128 float128_rem(float128 a, float128 b, float_status *status)
6662 int32 aExp, bExp, expDiff;
6663 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6664 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6668 aSig1 = extractFloat128Frac1( a );
6669 aSig0 = extractFloat128Frac0( a );
6670 aExp = extractFloat128Exp( a );
6671 aSign = extractFloat128Sign( a );
6672 bSig1 = extractFloat128Frac1( b );
6673 bSig0 = extractFloat128Frac0( b );
6674 bExp = extractFloat128Exp( b );
6675 if ( aExp == 0x7FFF ) {
6676 if ( ( aSig0 | aSig1 )
6677 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6678 return propagateFloat128NaN(a, b, status);
6682 if ( bExp == 0x7FFF ) {
6683 if (bSig0 | bSig1) {
6684 return propagateFloat128NaN(a, b, status);
6689 if ( ( bSig0 | bSig1 ) == 0 ) {
6691 float_raise(float_flag_invalid, status);
6692 z.low = float128_default_nan_low;
6693 z.high = float128_default_nan_high;
6696 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6699 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6700 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6702 expDiff = aExp - bExp;
6703 if ( expDiff < -1 ) return a;
6705 aSig0 | LIT64( 0x0001000000000000 ),
6707 15 - ( expDiff < 0 ),
6712 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6713 q = le128( bSig0, bSig1, aSig0, aSig1 );
6714 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6716 while ( 0 < expDiff ) {
6717 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6718 q = ( 4 < q ) ? q - 4 : 0;
6719 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6720 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6721 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6722 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6725 if ( -64 < expDiff ) {
6726 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6727 q = ( 4 < q ) ? q - 4 : 0;
6729 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6731 if ( expDiff < 0 ) {
6732 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6735 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6737 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6738 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6741 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6742 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6745 alternateASig0 = aSig0;
6746 alternateASig1 = aSig1;
6748 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6749 } while ( 0 <= (int64_t) aSig0 );
6751 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
6752 if ( ( sigMean0 < 0 )
6753 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6754 aSig0 = alternateASig0;
6755 aSig1 = alternateASig1;
6757 zSign = ( (int64_t) aSig0 < 0 );
6758 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6759 return normalizeRoundAndPackFloat128(aSign ^ zSign, bExp - 4, aSig0, aSig1,
6763 /*----------------------------------------------------------------------------
6764 | Returns the square root of the quadruple-precision floating-point value `a'.
6765 | The operation is performed according to the IEC/IEEE Standard for Binary
6766 | Floating-Point Arithmetic.
6767 *----------------------------------------------------------------------------*/
6769 float128 float128_sqrt(float128 a, float_status *status)
6773 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6774 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
6777 aSig1 = extractFloat128Frac1( a );
6778 aSig0 = extractFloat128Frac0( a );
6779 aExp = extractFloat128Exp( a );
6780 aSign = extractFloat128Sign( a );
6781 if ( aExp == 0x7FFF ) {
6782 if (aSig0 | aSig1) {
6783 return propagateFloat128NaN(a, a, status);
6785 if ( ! aSign ) return a;
6789 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6791 float_raise(float_flag_invalid, status);
6792 z.low = float128_default_nan_low;
6793 z.high = float128_default_nan_high;
6797 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6798 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6800 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6801 aSig0 |= LIT64( 0x0001000000000000 );
6802 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6803 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6804 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6805 doubleZSig0 = zSig0<<1;
6806 mul64To128( zSig0, zSig0, &term0, &term1 );
6807 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
6808 while ( (int64_t) rem0 < 0 ) {
6811 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6813 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6814 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6815 if ( zSig1 == 0 ) zSig1 = 1;
6816 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6817 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6818 mul64To128( zSig1, zSig1, &term2, &term3 );
6819 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
6820 while ( (int64_t) rem1 < 0 ) {
6822 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6824 term2 |= doubleZSig0;
6825 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6827 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6829 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6830 return roundAndPackFloat128(0, zExp, zSig0, zSig1, zSig2, status);
6834 /*----------------------------------------------------------------------------
6835 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6836 | the corresponding value `b', and 0 otherwise. The invalid exception is
6837 | raised if either operand is a NaN. Otherwise, the comparison is performed
6838 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6839 *----------------------------------------------------------------------------*/
6841 int float128_eq(float128 a, float128 b, float_status *status)
6844 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6845 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6846 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6847 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6849 float_raise(float_flag_invalid, status);
6854 && ( ( a.high == b.high )
6856 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6861 /*----------------------------------------------------------------------------
6862 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6863 | or equal to the corresponding value `b', and 0 otherwise. The invalid
6864 | exception is raised if either operand is a NaN. The comparison is performed
6865 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6866 *----------------------------------------------------------------------------*/
6868 int float128_le(float128 a, float128 b, float_status *status)
6872 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6873 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6874 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6875 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6877 float_raise(float_flag_invalid, status);
6880 aSign = extractFloat128Sign( a );
6881 bSign = extractFloat128Sign( b );
6882 if ( aSign != bSign ) {
6885 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6889 aSign ? le128( b.high, b.low, a.high, a.low )
6890 : le128( a.high, a.low, b.high, b.low );
6894 /*----------------------------------------------------------------------------
6895 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6896 | the corresponding value `b', and 0 otherwise. The invalid exception is
6897 | raised if either operand is a NaN. The comparison is performed according
6898 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6899 *----------------------------------------------------------------------------*/
6901 int float128_lt(float128 a, float128 b, float_status *status)
6905 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6906 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6907 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6908 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6910 float_raise(float_flag_invalid, status);
6913 aSign = extractFloat128Sign( a );
6914 bSign = extractFloat128Sign( b );
6915 if ( aSign != bSign ) {
6918 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
6922 aSign ? lt128( b.high, b.low, a.high, a.low )
6923 : lt128( a.high, a.low, b.high, b.low );
6927 /*----------------------------------------------------------------------------
6928 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6929 | be compared, and 0 otherwise. The invalid exception is raised if either
6930 | operand is a NaN. The comparison is performed according to the IEC/IEEE
6931 | Standard for Binary Floating-Point Arithmetic.
6932 *----------------------------------------------------------------------------*/
6934 int float128_unordered(float128 a, float128 b, float_status *status)
6936 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6937 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6938 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6939 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6941 float_raise(float_flag_invalid, status);
6947 /*----------------------------------------------------------------------------
6948 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
6949 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6950 | exception. The comparison is performed according to the IEC/IEEE Standard
6951 | for Binary Floating-Point Arithmetic.
6952 *----------------------------------------------------------------------------*/
6954 int float128_eq_quiet(float128 a, float128 b, float_status *status)
6957 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6958 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6959 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6960 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6962 if ( float128_is_signaling_nan( a )
6963 || float128_is_signaling_nan( b ) ) {
6964 float_raise(float_flag_invalid, status);
6970 && ( ( a.high == b.high )
6972 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
6977 /*----------------------------------------------------------------------------
6978 | Returns 1 if the quadruple-precision floating-point value `a' is less than
6979 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6980 | cause an exception. Otherwise, the comparison is performed according to the
6981 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6982 *----------------------------------------------------------------------------*/
6984 int float128_le_quiet(float128 a, float128 b, float_status *status)
6988 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6989 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6990 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6991 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6993 if ( float128_is_signaling_nan( a )
6994 || float128_is_signaling_nan( b ) ) {
6995 float_raise(float_flag_invalid, status);
6999 aSign = extractFloat128Sign( a );
7000 bSign = extractFloat128Sign( b );
7001 if ( aSign != bSign ) {
7004 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7008 aSign ? le128( b.high, b.low, a.high, a.low )
7009 : le128( a.high, a.low, b.high, b.low );
7013 /*----------------------------------------------------------------------------
7014 | Returns 1 if the quadruple-precision floating-point value `a' is less than
7015 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
7016 | exception. Otherwise, the comparison is performed according to the IEC/IEEE
7017 | Standard for Binary Floating-Point Arithmetic.
7018 *----------------------------------------------------------------------------*/
7020 int float128_lt_quiet(float128 a, float128 b, float_status *status)
7024 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7025 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7026 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7027 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7029 if ( float128_is_signaling_nan( a )
7030 || float128_is_signaling_nan( b ) ) {
7031 float_raise(float_flag_invalid, status);
7035 aSign = extractFloat128Sign( a );
7036 bSign = extractFloat128Sign( b );
7037 if ( aSign != bSign ) {
7040 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
7044 aSign ? lt128( b.high, b.low, a.high, a.low )
7045 : lt128( a.high, a.low, b.high, b.low );
7049 /*----------------------------------------------------------------------------
7050 | Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
7051 | be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
7052 | comparison is performed according to the IEC/IEEE Standard for Binary
7053 | Floating-Point Arithmetic.
7054 *----------------------------------------------------------------------------*/
7056 int float128_unordered_quiet(float128 a, float128 b, float_status *status)
7058 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
7059 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
7060 || ( ( extractFloat128Exp( b ) == 0x7FFF )
7061 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
7063 if ( float128_is_signaling_nan( a )
7064 || float128_is_signaling_nan( b ) ) {
7065 float_raise(float_flag_invalid, status);
7072 /* misc functions */
7073 float32 uint32_to_float32(uint32_t a, float_status *status)
7075 return int64_to_float32(a, status);
7078 float64 uint32_to_float64(uint32_t a, float_status *status)
7080 return int64_to_float64(a, status);
7083 uint32 float32_to_uint32(float32 a, float_status *status)
7087 int old_exc_flags = get_float_exception_flags(status);
7089 v = float32_to_int64(a, status);
7092 } else if (v > 0xffffffff) {
7097 set_float_exception_flags(old_exc_flags, status);
7098 float_raise(float_flag_invalid, status);
7102 uint32 float32_to_uint32_round_to_zero(float32 a, float_status *status)
7106 int old_exc_flags = get_float_exception_flags(status);
7108 v = float32_to_int64_round_to_zero(a, status);
7111 } else if (v > 0xffffffff) {
7116 set_float_exception_flags(old_exc_flags, status);
7117 float_raise(float_flag_invalid, status);
7121 int_fast16_t float32_to_int16(float32 a, float_status *status)
7125 int old_exc_flags = get_float_exception_flags(status);
7127 v = float32_to_int32(a, status);
7130 } else if (v > 0x7fff) {
7136 set_float_exception_flags(old_exc_flags, status);
7137 float_raise(float_flag_invalid, status);
7141 uint_fast16_t float32_to_uint16(float32 a, float_status *status)
7145 int old_exc_flags = get_float_exception_flags(status);
7147 v = float32_to_int32(a, status);
7150 } else if (v > 0xffff) {
7156 set_float_exception_flags(old_exc_flags, status);
7157 float_raise(float_flag_invalid, status);
7161 uint_fast16_t float32_to_uint16_round_to_zero(float32 a, float_status *status)
7165 int old_exc_flags = get_float_exception_flags(status);
7167 v = float32_to_int64_round_to_zero(a, status);
7170 } else if (v > 0xffff) {
7175 set_float_exception_flags(old_exc_flags, status);
7176 float_raise(float_flag_invalid, status);
7180 uint32 float64_to_uint32(float64 a, float_status *status)
7184 int old_exc_flags = get_float_exception_flags(status);
7186 v = float64_to_uint64(a, status);
7187 if (v > 0xffffffff) {
7192 set_float_exception_flags(old_exc_flags, status);
7193 float_raise(float_flag_invalid, status);
7197 uint32 float64_to_uint32_round_to_zero(float64 a, float_status *status)
7201 int old_exc_flags = get_float_exception_flags(status);
7203 v = float64_to_uint64_round_to_zero(a, status);
7204 if (v > 0xffffffff) {
7209 set_float_exception_flags(old_exc_flags, status);
7210 float_raise(float_flag_invalid, status);
7214 int_fast16_t float64_to_int16(float64 a, float_status *status)
7218 int old_exc_flags = get_float_exception_flags(status);
7220 v = float64_to_int32(a, status);
7223 } else if (v > 0x7fff) {
7229 set_float_exception_flags(old_exc_flags, status);
7230 float_raise(float_flag_invalid, status);
7234 uint_fast16_t float64_to_uint16(float64 a, float_status *status)
7238 int old_exc_flags = get_float_exception_flags(status);
7240 v = float64_to_int32(a, status);
7243 } else if (v > 0xffff) {
7249 set_float_exception_flags(old_exc_flags, status);
7250 float_raise(float_flag_invalid, status);
7254 uint_fast16_t float64_to_uint16_round_to_zero(float64 a, float_status *status)
7258 int old_exc_flags = get_float_exception_flags(status);
7260 v = float64_to_int64_round_to_zero(a, status);
7263 } else if (v > 0xffff) {
7268 set_float_exception_flags(old_exc_flags, status);
7269 float_raise(float_flag_invalid, status);
7273 /*----------------------------------------------------------------------------
7274 | Returns the result of converting the double-precision floating-point value
7275 | `a' to the 64-bit unsigned integer format. The conversion is
7276 | performed according to the IEC/IEEE Standard for Binary Floating-Point
7277 | Arithmetic---which means in particular that the conversion is rounded
7278 | according to the current rounding mode. If `a' is a NaN, the largest
7279 | positive integer is returned. If the conversion overflows, the
7280 | largest unsigned integer is returned. If 'a' is negative, the value is
7281 | rounded and zero is returned; negative values that do not round to zero
7282 | will raise the inexact exception.
7283 *----------------------------------------------------------------------------*/
7285 uint64_t float64_to_uint64(float64 a, float_status *status)
7288 int_fast16_t aExp, shiftCount;
7289 uint64_t aSig, aSigExtra;
7290 a = float64_squash_input_denormal(a, status);
7292 aSig = extractFloat64Frac(a);
7293 aExp = extractFloat64Exp(a);
7294 aSign = extractFloat64Sign(a);
7295 if (aSign && (aExp > 1022)) {
7296 float_raise(float_flag_invalid, status);
7297 if (float64_is_any_nan(a)) {
7298 return LIT64(0xFFFFFFFFFFFFFFFF);
7304 aSig |= LIT64(0x0010000000000000);
7306 shiftCount = 0x433 - aExp;
7307 if (shiftCount <= 0) {
7309 float_raise(float_flag_invalid, status);
7310 return LIT64(0xFFFFFFFFFFFFFFFF);
7313 aSig <<= -shiftCount;
7315 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7317 return roundAndPackUint64(aSign, aSig, aSigExtra, status);
7320 uint64_t float64_to_uint64_round_to_zero(float64 a, float_status *status)
7322 signed char current_rounding_mode = status->float_rounding_mode;
7323 set_float_rounding_mode(float_round_to_zero, status);
7324 int64_t v = float64_to_uint64(a, status);
7325 set_float_rounding_mode(current_rounding_mode, status);
7329 #define COMPARE(s, nan_exp) \
7330 static inline int float ## s ## _compare_internal(float ## s a, float ## s b,\
7331 int is_quiet, float_status *status) \
7333 flag aSign, bSign; \
7334 uint ## s ## _t av, bv; \
7335 a = float ## s ## _squash_input_denormal(a, status); \
7336 b = float ## s ## _squash_input_denormal(b, status); \
7338 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7339 extractFloat ## s ## Frac( a ) ) || \
7340 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7341 extractFloat ## s ## Frac( b ) )) { \
7343 float ## s ## _is_signaling_nan( a ) || \
7344 float ## s ## _is_signaling_nan( b ) ) { \
7345 float_raise(float_flag_invalid, status); \
7347 return float_relation_unordered; \
7349 aSign = extractFloat ## s ## Sign( a ); \
7350 bSign = extractFloat ## s ## Sign( b ); \
7351 av = float ## s ## _val(a); \
7352 bv = float ## s ## _val(b); \
7353 if ( aSign != bSign ) { \
7354 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
7356 return float_relation_equal; \
7358 return 1 - (2 * aSign); \
7362 return float_relation_equal; \
7364 return 1 - 2 * (aSign ^ ( av < bv )); \
7369 int float ## s ## _compare(float ## s a, float ## s b, float_status *status) \
7371 return float ## s ## _compare_internal(a, b, 0, status); \
7374 int float ## s ## _compare_quiet(float ## s a, float ## s b, \
7375 float_status *status) \
7377 return float ## s ## _compare_internal(a, b, 1, status); \
7383 static inline int floatx80_compare_internal(floatx80 a, floatx80 b,
7384 int is_quiet, float_status *status)
7388 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7389 ( extractFloatx80Frac( a )<<1 ) ) ||
7390 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7391 ( extractFloatx80Frac( b )<<1 ) )) {
7393 floatx80_is_signaling_nan( a ) ||
7394 floatx80_is_signaling_nan( b ) ) {
7395 float_raise(float_flag_invalid, status);
7397 return float_relation_unordered;
7399 aSign = extractFloatx80Sign( a );
7400 bSign = extractFloatx80Sign( b );
7401 if ( aSign != bSign ) {
7403 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7404 ( ( a.low | b.low ) == 0 ) ) {
7406 return float_relation_equal;
7408 return 1 - (2 * aSign);
7411 if (a.low == b.low && a.high == b.high) {
7412 return float_relation_equal;
7414 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7419 int floatx80_compare(floatx80 a, floatx80 b, float_status *status)
7421 return floatx80_compare_internal(a, b, 0, status);
7424 int floatx80_compare_quiet(floatx80 a, floatx80 b, float_status *status)
7426 return floatx80_compare_internal(a, b, 1, status);
7429 static inline int float128_compare_internal(float128 a, float128 b,
7430 int is_quiet, float_status *status)
7434 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7435 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7436 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7437 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7439 float128_is_signaling_nan( a ) ||
7440 float128_is_signaling_nan( b ) ) {
7441 float_raise(float_flag_invalid, status);
7443 return float_relation_unordered;
7445 aSign = extractFloat128Sign( a );
7446 bSign = extractFloat128Sign( b );
7447 if ( aSign != bSign ) {
7448 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7450 return float_relation_equal;
7452 return 1 - (2 * aSign);
7455 if (a.low == b.low && a.high == b.high) {
7456 return float_relation_equal;
7458 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7463 int float128_compare(float128 a, float128 b, float_status *status)
7465 return float128_compare_internal(a, b, 0, status);
7468 int float128_compare_quiet(float128 a, float128 b, float_status *status)
7470 return float128_compare_internal(a, b, 1, status);
7473 /* min() and max() functions. These can't be implemented as
7474 * 'compare and pick one input' because that would mishandle
7475 * NaNs and +0 vs -0.
7477 * minnum() and maxnum() functions. These are similar to the min()
7478 * and max() functions but if one of the arguments is a QNaN and
7479 * the other is numerical then the numerical argument is returned.
7480 * minnum() and maxnum correspond to the IEEE 754-2008 minNum()
7481 * and maxNum() operations. min() and max() are the typical min/max
7482 * semantics provided by many CPUs which predate that specification.
7484 * minnummag() and maxnummag() functions correspond to minNumMag()
7485 * and minNumMag() from the IEEE-754 2008.
7488 static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
7489 int ismin, int isieee, \
7491 float_status *status) \
7493 flag aSign, bSign; \
7494 uint ## s ## _t av, bv, aav, abv; \
7495 a = float ## s ## _squash_input_denormal(a, status); \
7496 b = float ## s ## _squash_input_denormal(b, status); \
7497 if (float ## s ## _is_any_nan(a) || \
7498 float ## s ## _is_any_nan(b)) { \
7500 if (float ## s ## _is_quiet_nan(a) && \
7501 !float ## s ##_is_any_nan(b)) { \
7503 } else if (float ## s ## _is_quiet_nan(b) && \
7504 !float ## s ## _is_any_nan(a)) { \
7508 return propagateFloat ## s ## NaN(a, b, status); \
7510 aSign = extractFloat ## s ## Sign(a); \
7511 bSign = extractFloat ## s ## Sign(b); \
7512 av = float ## s ## _val(a); \
7513 bv = float ## s ## _val(b); \
7515 aav = float ## s ## _abs(av); \
7516 abv = float ## s ## _abs(bv); \
7519 return (aav < abv) ? a : b; \
7521 return (aav < abv) ? b : a; \
7525 if (aSign != bSign) { \
7527 return aSign ? a : b; \
7529 return aSign ? b : a; \
7533 return (aSign ^ (av < bv)) ? a : b; \
7535 return (aSign ^ (av < bv)) ? b : a; \
7540 float ## s float ## s ## _min(float ## s a, float ## s b, \
7541 float_status *status) \
7543 return float ## s ## _minmax(a, b, 1, 0, 0, status); \
7546 float ## s float ## s ## _max(float ## s a, float ## s b, \
7547 float_status *status) \
7549 return float ## s ## _minmax(a, b, 0, 0, 0, status); \
7552 float ## s float ## s ## _minnum(float ## s a, float ## s b, \
7553 float_status *status) \
7555 return float ## s ## _minmax(a, b, 1, 1, 0, status); \
7558 float ## s float ## s ## _maxnum(float ## s a, float ## s b, \
7559 float_status *status) \
7561 return float ## s ## _minmax(a, b, 0, 1, 0, status); \
7564 float ## s float ## s ## _minnummag(float ## s a, float ## s b, \
7565 float_status *status) \
7567 return float ## s ## _minmax(a, b, 1, 1, 1, status); \
7570 float ## s float ## s ## _maxnummag(float ## s a, float ## s b, \
7571 float_status *status) \
7573 return float ## s ## _minmax(a, b, 0, 1, 1, status); \
7580 /* Multiply A by 2 raised to the power N. */
7581 float32 float32_scalbn(float32 a, int n, float_status *status)
7587 a = float32_squash_input_denormal(a, status);
7588 aSig = extractFloat32Frac( a );
7589 aExp = extractFloat32Exp( a );
7590 aSign = extractFloat32Sign( a );
7592 if ( aExp == 0xFF ) {
7594 return propagateFloat32NaN(a, a, status);
7600 } else if (aSig == 0) {
7608 } else if (n < -0x200) {
7614 return normalizeRoundAndPackFloat32(aSign, aExp, aSig, status);
7617 float64 float64_scalbn(float64 a, int n, float_status *status)
7623 a = float64_squash_input_denormal(a, status);
7624 aSig = extractFloat64Frac( a );
7625 aExp = extractFloat64Exp( a );
7626 aSign = extractFloat64Sign( a );
7628 if ( aExp == 0x7FF ) {
7630 return propagateFloat64NaN(a, a, status);
7635 aSig |= LIT64( 0x0010000000000000 );
7636 } else if (aSig == 0) {
7644 } else if (n < -0x1000) {
7650 return normalizeRoundAndPackFloat64(aSign, aExp, aSig, status);
7653 floatx80 floatx80_scalbn(floatx80 a, int n, float_status *status)
7659 aSig = extractFloatx80Frac( a );
7660 aExp = extractFloatx80Exp( a );
7661 aSign = extractFloatx80Sign( a );
7663 if ( aExp == 0x7FFF ) {
7665 return propagateFloatx80NaN(a, a, status);
7679 } else if (n < -0x10000) {
7684 return normalizeRoundAndPackFloatx80(status->floatx80_rounding_precision,
7685 aSign, aExp, aSig, 0, status);
7688 float128 float128_scalbn(float128 a, int n, float_status *status)
7692 uint64_t aSig0, aSig1;
7694 aSig1 = extractFloat128Frac1( a );
7695 aSig0 = extractFloat128Frac0( a );
7696 aExp = extractFloat128Exp( a );
7697 aSign = extractFloat128Sign( a );
7698 if ( aExp == 0x7FFF ) {
7699 if ( aSig0 | aSig1 ) {
7700 return propagateFloat128NaN(a, a, status);
7705 aSig0 |= LIT64( 0x0001000000000000 );
7706 } else if (aSig0 == 0 && aSig1 == 0) {
7714 } else if (n < -0x10000) {
7719 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1