Search code examples
floating-point precision

Floating Point: Distance between MAX and INF


Let's say you have a variable of a floating point type. 32 or 64 bit doesn't matter.

You assign the maximum representable value to the variable. Typical programming languages have a constant for that.

How do you determine the least value that you must add to your variable so that it 'snaps' over to infinity?

I am aware of functions like nextafter* and nexttoward* in C, next_up in Rust, etc. Those are related, but do not give me the value that I need.


Solution

  • Here is a solution for float in C. It reads FLT_ROUNDS, whose value reflects the current rounding mode and may therefore change during program execution (for example, after a call to fesetround). If a program changes the rounding mode, this code should use #pragma STDC FENV_ACCESS ON to inform the compiler that it depends on the floating-point environment.

    #include <float.h>
    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>
    
    
    /*  Determine the smallest float value that, when added to FLT_MAX, yields
        +infinity under the rounding mode reported by FLT_ROUNDS, then print
        that value together with a two-line demonstration.

        Every sum whose overflow behavior matters is first stored in a float
        object.  When FLT_EVAL_METHOD is not 0, an expression such as
        FLT_MAX + x may be evaluated with extra range and precision and would
        then never overflow; assignment to a float is required by the C
        standard to discard that extra range and precision.

        Exits with EXIT_FAILURE if FLT_ROUNDS has a value this program does
        not recognize.
    */
    int main(void)
    {
        #if !defined INFINITY

            printf("Infinity is not representable, so no value added to %a can produce infinity.\n", FLT_MAX);

        #else

            /*  a will be set to the largest value that can be added that will
                not produce infinity, and b will be set to the smallest value that
                will produce infinity.
            */
            float a, b;

            switch (FLT_ROUNDS)
            {
                case  0:  // Toward zero.
                case  3:  // Downward, toward negative infinity.
                {
                    //  With rounding downward or toward zero, no finite value will round to +infinity.
                    a = FLT_MAX;
                    b = INFINITY;
                    break;
                }

                case  1:  // To nearest, ties to even.
                case  4:  // To nearest, ties away from zero.
                {
                    //  Determine the ULP at FLT_MAX.
                    float u = FLT_MAX - nexttowardf(FLT_MAX, 0);

                    /*  Half an ULP is the smallest addend that produces
                        infinity; the next float toward zero is the largest
                        addend that does not.
                    */
                    b = u/2;
                    a = nexttowardf(b, 0);

                    break;
                }

                case  2:  // Upward, toward positive infinity.
                {
                    /*  With rounding upward, adding any positive value will
                        produce infinity; adding zero leaves FLT_MAX unchanged.
                    */
                    a = 0;
                    b = FLT_TRUE_MIN;
                    break;
                }

                case -1:  // Indeterminable, or, rather, the implementation will not tell us.
                {
                    //  Check whether boundary is between FLT_MAX and INFINITY.
                    //  Stored in a float so the sum is rounded to float range.
                    float doubled = FLT_MAX + FLT_MAX;

                    if (doubled < INFINITY)
                    {
                        /*  Adding FLT_MAX does not produce infinity, so infinity
                            is the smallest value that does.
                        */
                        a = FLT_MAX;
                        b = INFINITY;
                    }

                    else
                    {
                        /*  Otherwise, do a binary search.  Invariant: adding a
                            does not produce infinity, adding b does.  Stop when
                            a and b are adjacent floats.
                        */
                        a = 0;
                        b = FLT_MAX;
                        while (nexttowardf(a, b) != b)
                        {
                            //  a + (b - a) / 2 cannot overflow, unlike (a + b) / 2.
                            float middle = a + (b - a) / 2;
                            float sum = FLT_MAX + middle;

                            if (sum < INFINITY)
                            {
                                a = middle;
                            }
                            else
                            {
                                b = middle;
                            }
                        }
                    }

                    break;
                }

                default:
                {
                    printf("FLT_ROUNDS is %d, which does not conform to the C 2024 standard.\n", FLT_ROUNDS);
                    exit(EXIT_FAILURE);
                }
            }

            //  Round the demonstration sums to float before printing them.
            float sum_with_a = FLT_MAX + a;
            float sum_with_b = FLT_MAX + b;

            printf("The smallest value that will produce infinity is %a.\n", b);
            printf("Demonstration:\n");
            printf("\t%a + %a = %a.\n", FLT_MAX, a, sum_with_a);
            printf("\t%a + %a = %a.\n", FLT_MAX, b, sum_with_b);

        #endif
    }