c floating-point precision ieee-754 sqrt

IEEE 754 conformant sqrtf() implementation taking into account hardware restrictions and usage limitations

Follow-up question for IEEE 754 conformant sqrt() implementation for double type.

Context: Need to implement IEEE 754 conformant sqrtf() taking into account the following HW restrictions and usage limitations:

Provides a special instruction qseed.f to get an approximation of the reciprocal of the square root (the accuracy of the result is no less than 6.75 bits, and therefore always within ±1% of the accurate result).
Single precision FP:

a. Support by HW (SP FPU): has support;

b. Support by SW (library): has support;

c. Support of subnormal numbers: no support (FLT_HAS_SUBNORM is 0).
Double precision FP:

a. Support by HW (DP FPU): no support;

b. Support by SW (library): has support;

c. Support of subnormal numbers: no support (DBL_HAS_SUBNORM is 0).

I've found one presentation by John Harrison and ended up with this implementation (note that here qseed.f is replaced by rsqrtf()):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

// https://github.com/nickzman/hyperspace/blob/master/frsqrt.hh
#if 1
float rsqrtf ( float x )
{
    const float xhalf = 0.5f * x;
    int         i     = *(int*) & x;

    i = 0x5f375a86 - ( i >> 1 );
    x = *(float*) & i;
    x = x * ( 1.5f - xhalf * x * x );
    x = x * ( 1.5f - xhalf * x * x );
    x = x * ( 1.5f - xhalf * x * x );

    return x;
}
#else
float rsqrtf ( float x )
{
    return 1.0f / sqrtf( x );
}
#endif

float   sqrtfr_jh( float x, float r )
{
    /*
     * John Harrison, Formal Verification Methods 5: Floating Point Verification,
     * Intel Corporation, 12 December 2002, document name: slides5.pdf, page 14,
     * slide "The square root algorithm".
     * URL: https://www.cl.cam.ac.uk/~jrh13/slides/anu-09_12dec02/slides5.pdf
     */
    double rd, b, z0, s0, d, k, h0, e, t0, s1, c, d1, h1, s;
    static const double half = 0.5;
    static const double one = 1.0;
    static const double three = 3.0;
    static const double two = 2.0;
    rd = (double)r;
    b = half * x;
    z0 = rd * rd;
    s0 = x * rd;
    d = fma( -b, z0, half );
    k = fma( x, rd, -s0 );
    h0 = half * rd;
    e = fma( three / two, d, one );
    t0 = fma( d, s0, k );
    s1 = fma( e, t0, s0 );
    c = fma( d, e, one );
    d1 = fma( -s1, s1, x );
    h1 = c * h0;
    s = fma( d1, h1, s1 );
    return (float)s;
}

float   my_sqrtf( float x )
{
    /* handle special cases */
    if (x == 0) {
        return x + x;
    }
    /* handle normal cases */
    if ((x > 0) && (x < INFINITY)) {
        return sqrtfr_jh( x, rsqrtf( x ) );
    }
    /* handle special cases */
    return (x < 0) ? NAN : (x + x);
}

/*
  https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
  From: geo <gmars...@gmail.com>
  Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
  Subject: 64-bit KISS RNGs
  Date: Sat, 28 Feb 2009 04:30:48 -0800 (PST)

  This 64-bit KISS RNG has three components, each nearly
  good enough to serve alone.    The components are:
  Multiply-With-Carry (MWC), period (2^121+2^63-1)
  Xorshift (XSH), period 2^64-1
  Congruential (CNG), period 2^64
*/
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
#define MWC64  (kiss64_t = (kiss64_x << 58) + kiss64_c, \
                kiss64_c = (kiss64_x >> 6), kiss64_x += kiss64_t, \
                kiss64_c += (kiss64_x < kiss64_t), kiss64_x)
#define XSH64  (kiss64_y ^= (kiss64_y << 13), kiss64_y ^= (kiss64_y >> 17), \
                kiss64_y ^= (kiss64_y << 43))
#define CNG64  (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
#define KISS64 (MWC64 + XSH64 + CNG64)

int main (void)
{
    const uint64_t N = 10000000000ULL; /* desired number of test cases */
    float arg, ref, res;
    uint64_t argi64;
    uint32_t refi, resi;
    uint64_t count = 0;
    float spec[] = {0.0f, 1.0f, INFINITY, NAN};

    printf ("test a few special cases:\n");
    for (int i = 0; i < sizeof (spec)/sizeof(spec[0]); i++) {
        printf ("my_sqrt(%a) = %a\n", spec[i], my_sqrtf(spec[i]));
        printf ("my_sqrt(%a) = %a\n", -spec[i], my_sqrtf(-spec[i]));
    }
    
    printf ("test %lu random cases:\n", N);
    do {
        argi64 = KISS64;
        memcpy (&arg, &argi64, sizeof arg);
        if ( fpclassify(arg) == FP_SUBNORMAL )
        {
            continue;
        }
        ++count;
        res = my_sqrtf (arg);
        ref = sqrtf (arg);
        memcpy (&resi, &res, sizeof resi);
        memcpy (&refi, &ref, sizeof refi);
        if ( ! ( isnan(res) && isnan(ref) ) )
        if (resi != refi) {
            printf ("\rerror @ arg=%a (%e)\n", arg, arg);
            printf ("\rerror @ res=%a (%e)\n", res, res);
            printf ("\rerror @ ref=%a (%e)\n", ref, ref);
            return EXIT_FAILURE;
        }
        if ((count & 0xfffff) == 0) printf ("\r[%lu]", count);
    } while (count < N);
    printf ("\r[%lu]", count);
    printf ("\ntests PASSED\n");
    return EXIT_SUCCESS;
}

And it seems to work correctly (at least for some random cases): it reports:

[10000000000]
tests PASSED

Now the question: since the original John Harrison's sqrtf() algorithm uses only single precision computations (i.e. type float), it is possible to reduce the number of operations when using only (except conversions) double precision computations (i.e. type double) and still be IEEE 754 conformant?

P.S. Since users @njuffa and @chux - Reinstate Monica are strong in FP, I invite them to participate. However, all the competent in FP users are welcome.

Solution

Computing a single-precision square root via double-precision code is going to be inefficient, especially if the hardware provides no native double-precision operations.

The following assumes hardware that conforms to IEEE-754 (2008), except that subnormals are not supported and flushed to zero. Fused-multiply add (FMA) is supported. It further assumes an ISO-C99 compiler that maps float to IEEE-754 binary32, and that maps the hardware's single-precision FMA instruction to the standard math function fmaf().

From a hardware starting approximation for the reciprocal square root with a maximum relative error of 2^-6.75 one can get to a reciprocal square root accurate to 1 single-precision ulp with two Newton-Raphson iterations. Multiplying this with the original argument provides an accurate estimate of the square root. The square of this approximation is subtracted from the orginal argument to compute the approximation error for the square root. This error is then used to apply a correction to the square root approximation, resulting in a correctly-rounded square root.

However, this straightforward algorithm breaks down for arguments that are very small due to underflow or overflow in intermediate computation, in particular when the underlying arithmetic operates in flash-to-zero mode that flushes subnormals to zero. For such arguments we can construct a slowpath code that scales the input towards unity, and scales back the result accordingly once the square root has been computed. Code for handling special operands such as zeros, infinities, NaNs, and negative arguments other than zero is also added to this slowpath code.

The NaN generated by the slowpath code for invalid operations should be adjusted to match the system's existing operations. For example, for x86-based systems this would be a special QNaN called INDEFINITE, with a bit pattern of 0xffc00000, while for a GPU running CUDA it would be the canonical single-precision NaN with a bit pattern of 0x7fffffff.

For performance reasons it may be useful to inline the fastpath code while making the slowpath code a called outlined subroutine. Single-precision math functions with a single argument should always be tested exhaustively against a "golden" reference implementation, which takes just minutes on modern hardware.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

float uint32_as_float (uint32_t);
uint32_t float_as_uint32 (float);
float qseedf (float);
float sqrtf_slowpath (float);

/* Square root computation for IEEE-754 binary32 mapped to 'float' */
float my_sqrtf (float arg)
{
    const uint32_t upper = float_as_uint32 (0x1.fffffep+127f);
    const uint32_t lower = float_as_uint32 (0x1.000000p-102f);
    float rsq, sqt, err;

    /* use fastpath computation if argument in [0x1.0p-102, 0x1.0p+128) */
    if ((float_as_uint32 (arg) - lower) <= (upper - lower)) {
        /* generate low-accuracy approximation to rsqrt(arg) */
        rsq = qseedf (arg);
        
        /* apply two Newton-Raphson iterations with quadratic convergence */
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        
        /* compute sqrt from rsqrt, round result to nearest or even */
        sqt = rsq * arg;
        err = fmaf (sqt, -sqt, arg);
        sqt = fmaf (0.5f * rsq, err, sqt);
    } else {
        sqt = sqrtf_slowpath (arg);
    }
    return sqt;
}

/* interprete bit pattern of 32-bit unsigned integer as IEEE-754 binary32 */
float uint32_as_float (uint32_t a)
{
    float r;
    memcpy (&r, &a, sizeof r);
    return r;
}

/* interprete bit pattern of  IEEE-754 binary32 as a 32-bit unsigned integer */
uint32_t float_as_uint32 (float a)
{
    uint32_t r;
    memcpy (&r, &a, sizeof r);
    return r;
}

/* simulate low-accuracy hardware approximation to 1/sqrt(a) */
float qseedf (float a)
{
    float r = 1.0f / sqrtf (a);
    r = uint32_as_float (float_as_uint32 (r) & ~0x1ffff);
    return r;
}

/* square root computation suitable for all IEEE-754 binary32 arguments */
float sqrtf_slowpath (float arg)
{
    const float FP32_INFINITY = uint32_as_float (0x7f800000);
    const float FP32_QNAN = uint32_as_float (0xffc00000); /* system specific */
    const float scale_in  = 0x1.0p+26f;
    const float scale_out = 0x1.0p-13f;
    float rsq, err, sqt;

    if (arg < 0.0f) {
        return FP32_QNAN;
    } else if ((arg == 0.0f) || !(fabsf (arg) < FP32_INFINITY)) { /* Inf, NaN */
        return arg + arg;
    } else {
        /* scale subnormal arguments towards unity */
        arg = arg * scale_in;
        
        /* generate low-accuracy approximation to rsqrt(arg) */
        rsq = qseedf (arg);
        
        /* apply two Newton-Raphson iterations with quadratic convergence */
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        rsq = fmaf (fmaf (-0.5f * arg * rsq, rsq, 0.5f), rsq, rsq);
        
        /* compute sqrt from rsqrt, round to nearest or even */
        sqt = rsq * arg;
        err = fmaf (sqt, -sqt, arg);
        sqt = fmaf (0.5f * rsq, err, sqt);

        /* compensate scaling of argument by counter-scaling the result */
        sqt = sqt * scale_out;
        
        return sqt;
    }
}

int main (void)
{
    uint32_t ai, resi, refi;
    float a, res, reff;
    double ref;

    ai = 0x00000000;
    do {
        a = uint32_as_float (ai);
        res = my_sqrtf (a);
        ref = sqrt ((double)a);
        reff = (float)ref;
        resi = float_as_uint32 (res);
        refi = float_as_uint32 (reff);
        if (resi != refi) {
            printf ("error @ %08x %15.8e   res=%08x %15.8e  ref=%08x %15.8e\n",
                    ai, a, resi, res, refi, reff);
            return EXIT_FAILURE;
        }
        

        ai++;
    } while (ai);
    return EXIT_SUCCESS;
}