Search code examples
algorithmmathfloating-pointieee-754approximation

Best non-trigonometric floating point approximation of tanh(x) in 10 instructions or less


Description

I need a reasonably accurate fast hyperbolic tangent for a machine that has no built-in floating point trigonometry, so e.g. the usual tanh(x) = (exp(2x) - 1) / (exp(2x) + 1) formula is going to need an approximation of exp(2x).
All other instructions like addition, subtraction, multiplication, division, and even FMA (= MUL+ADD in 1 op) are present.

Right now I have several approximations, but none of them are satisfactory in terms of accuracy.

[Update from the comments:]

  • The instruction for trunc()/floor() is available
  • There is a way to transparently reinterpret floats as integers and do all kinds of bit ops
  • There is a family of instructions called SEL.xx (.GT, .LE, etc.) which compare 2 values and choose what to write to the destination
  • DIVs are twice as slow, so nothing exceptional, DIVs are okay to use

Approach 1

Accuracy: ±1.2% absolute error, see here.

Pseudocode (A = accumulator register, T = temporary register):

[1] FMA T, 36.f / 73.f, A, A   // T := 36/73 + X^2
[2] MUL A, A, T                // A := X(36/73 + X^2)
[3] ABS T, A                   // T := |X(36/73 + X^2)|
[4] ADD T, T, 32.f / 73.f      // T := |X(36/73 + X^2)| + 32/73
[5] DIV A, A, T                // A := X(36/73 + X^2) / (|X(36/73 + X^2)| + 32/73)

Approach 2

Accuracy: ±0.9% absolute error, see here.

Pseudocode (A = accumulator register, T = temporary register):

[1] FMA T, 3.125f, A, A        // T := 3.125 + X^2
[2] DIV T, 25.125f, T          // T := 25.125/(3.125 + X^2)
[3] MUL A, A, 0.1073f          // A := 0.1073*X
[4] FMA A, A, A, T             // A := 0.1073*X + 0.1073*X*25.125/(3.125 + X^2)
[5] MIN A, A, 1.f              // A := min(0.1073*X + 0.1073*X*25.125/(3.125 + X^2), 1)
[6] MAX A, A, -1.f             // A := max(min(0.1073*X + 0.1073*X*25.125/(3.125 + X^2), 1), -1)

Approach 3

Accuracy: ±0.13% absolute error, see here.

Pseudocode (A = accumulator register, T = temporary register):

[1] FMA T, 14.f, A, A          // T := 14 + X^2
[2] FMA T, -133.f, T, T        // T := (14 + X^2)^2 - 133
[3] DIV T, A, T                // T := X/((14 + X^2)^2 - 133)
[4] FMA A, 52.5f, A, A         // A := 52.5 + X^2
[5] MUL A, A, RSQRT(15.f)      // A := (52.5 + X^2)/sqrt(15)
[6] FMA A, -120.75f, A, A      // A := (52.5 + X^2)^2/15 - 120.75
[7] MUL A, A, T                // A := ((52.5 + X^2)^2/15 - 120.75)*X/((14 + X^2)^2 - 133)
[8] MIN A, A, 1.f              // A := min(((52.5 + X^2)^2/15 - 120.75)*X/((14 + X^2)^2 - 133), 1)
[9] MAX A, A, -1.f             // A := max(min(((52.5 + X^2)^2/15 - 120.75)*X/((14 + X^2)^2 - 133), 1), -1)

The question

Is there anything better that can possibly fit in 10 non-trigonometric float32 instructions?


Solution

  • After doing much exploratory work, I came to the conclusion that approach 2 is the most promising direction. Since division is very fast on the asker's platform, rational approximations are attractive. The platform's support for FMA should be exploited aggressively. Below I am showing C code that implements a fast tanhf() in seven operations and achieves maximum absolute error of less than 2.8e-3.

    I used the Remez algorithm to compute the coefficients for the rational approximation and used a heuristic search to reduce these coefficients to as few bits as feasible, which may benefit some processor architectures that are able to incorporate floating-point data into an immediate field of commonly used floating-point instructions.

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    
    /* Fast computation of hyperbolic tangent. Rational approximation with clamping.
       Maximum absolute errror = 2.77074604e-3 @ +/-3.29019976
    */
    float fast_tanhf_rat (float x)
    {
        const float n0 = -8.73291016e-1f; // -0x1.bf2000p-1
        const float n1 = -2.76107788e-2f; // -0x1.c46000p-6
        const float d0 =  2.79589844e+0f; //  0x1.65e000p+1
        float x2 = x * x;
        float num = fmaf (n0, x2, n1);
        float den = x2 + d0;
        float quot = num / den;
        float res = fmaf (quot, x, x);
        res = fminf (fmaxf (res, -1.0f), 1.0f);
        return res;
    }
    
    int main (void)
    {
        double ref, err, maxerr = 0;
        float arg, res, maxerrloc = INFINITY;
        maxerr = 0;
        arg = 0.0f;
        while (arg < 0x1.0p64f) {
            res = fast_tanhf_rat (arg);
            ref = tanh ((double)arg);
            err = fabs ((double)res - ref);
            if (err > maxerr) {
                maxerr = err;
                maxerrloc = arg;
            }
            arg = nextafterf (arg, INFINITY);
        }
        arg = -0.0f;
        while (arg > -0x1.0p64f) {
            res = fast_tanhf_rat (arg);
            ref = tanh ((double)arg);
            err = fabs ((double)res - ref);
            if (err > maxerr) {
                maxerr = err;
                maxerrloc = arg;
            }
            arg = nextafterf (arg, -INFINITY);
        }
        printf ("maximum absolute error = %15.8e @ %15.8e\n", maxerr, maxerrloc);
        return EXIT_SUCCESS;
    }
    

    Given that asker budgeted for up to ten operations, we can increase the degree of both numerator and denominator polynomials by one to achieve a fast tanhf() implementation comprising nine operations that has significantly lower maximum absolute error, less than 5.8e-5:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    
    /* Fast computation of hyperbolic tangent. Rational approximation with clamping.
       Maximum absolute error = 5.77514052e-5 @ +/-=2.22759748
     */
    float fast_tanhf_rat2 (float x)
    {
        const float n0 = -9.49066162e-1f; // -0x1.e5ec00p-1
        const float n1 = -2.68447266e+1f; // -0x1.ad8400p+4
        const float n2 = -2.01115608e-2f; // -0x1.498200p-6
        const float d0 =  3.49853516e+1f; //  0x1.17e200p+5
        const float d1 =  8.07031250e+1f; //  0x1.42d000p+6
        float x2 = x * x;
        float num = fmaf (fmaf (n0, x2, n1), x2, n2);
        float den = fmaf (x2 + d0, x2, d1);
        float quot = num / den;
        float res = fmaf (quot, x, x);
        res = fminf (fmaxf (res, -1.0f), 1.0f);
        return res;
    }
    
    int main (void)
    {
        double ref, err, maxerr = 0;
        float arg, res, maxerrloc = INFINITY;
        maxerr = 0;
        arg = 0.0f;
        while (arg < 0x1.0p32f) {
            res = fast_tanhf_rat2 (arg);
            ref = tanh ((double)arg);
            err = fabs ((double)res - ref);
            if (err > maxerr) {
                maxerr = err;
                maxerrloc = arg;
            }
            arg = nextafterf (arg, INFINITY);
        }
        arg = -0.0f;
        while (arg > -0x1.0p32f) {
            res = fast_tanhf_rat2 (arg);
            ref = tanh ((double)arg);
            err = fabs ((double)res - ref);
            if (err > maxerr) {
                maxerr = err;
                maxerrloc = arg;
            }
            arg = nextafterf (arg, -INFINITY);
        }
        printf ("maximum absolute error = %15.8e @ %15.8e\n", maxerr, maxerrloc);
        return EXIT_SUCCESS;
    }
    

    Clamping the output of the approximation to the interval [-1, 1] is unnecessary if we can guarantee that the approximation can produces values outside this range. Single-precision implementations can be tested exhaustively, so one can show that by adjusting the coefficients of the approximation slightly this can be successfully enforces. By clamping the argument to a specific single-precision number for which the approximation returns the value ±1, the correct asymptotic behavior is achieved. This requires that all basic arithmetic operations and in particular the division are compliant with IEEE-754 and thus correctly rounded, all operands are IEEE-754 binary32 operands, and that rounding to nearest-or-even is in effect. Using the maximum of 10 operations allowed by the asker, maximum absolute and relative error of less than 2.0e-5 can be achieved:

    #include <stdio.h>
    #include <stdlib.h>
    #include <math.h>
    
    /* Fast computation of hyperbolic tangent. Rational approximation with clamping
       of the argument. Maximum absolute error = 1.98537030e-5, maximum relative
       error = 1.98540995e-5, maximum ulp error = 333.089863.
    */
    float fast_tanhf_rat3 (float x) // 10 operations
    {
        const float cutoff = 5.76110792f; //  0x1.70b5fep+2
        const float n0 = -1.60153955e-4f; // -0x1.4fde00p-13
        const float n1 = -9.34448242e-1f; // -0x1.de7000p-1
        const float n2 = -2.19176636e+1f; // -0x1.5eaec0p+4
        const float d0 =  2.90915985e+1f; //  0x1.d17730p+4
        const float d1 =  6.57667847e+1f; //  0x1.071130p+6
        float y = fminf (fmaxf (x, -cutoff), cutoff);
        float y2 = y * y;
        float num = fmaf (fmaf (n0, y2, n1), y2, n2) * y2;
        float den = fmaf (y2 + d0, y2, d1);
        float quot = num / den;
        float res = fmaf (quot, y, y);
        return res;
    }
    
    int main (void)
    {
        double ref, abserr, relerr, maxabserr = 0, maxrelerr = 0;
        float arg, res, maxabserrloc = INFINITY, maxrelerrloc = INFINITY;
    
        maxabserr = 0;
        maxrelerr = 0;
        arg = 0.0f;
        while (arg < INFINITY) {
            res = fast_tanhf_rat3 (arg);
            if (res > 1) { 
                printf ("error at %15.8e: result out of bounds\n", arg);
                return EXIT_FAILURE;
            }
            ref = tanh ((double)arg);
            abserr = fabs ((double)res - ref);
            if (abserr > maxabserr) {
                maxabserr = abserr;
                maxabserrloc = arg;
            }
            relerr = fabs (((double)res - ref) / ref);
            if (relerr > maxrelerr) {
                maxrelerr = relerr;
                maxrelerrloc = arg;
            }
            arg = nextafterf (arg, INFINITY);
        }
        arg = -0.0f;
        while (arg > -INFINITY) {
            res = fast_tanhf_rat3 (arg);
            if (res < -1) { 
                printf ("error at %15.8e: result out of bounds\n", arg);
                return EXIT_FAILURE;
            }
            ref = tanh ((double)arg);
            abserr = fabs ((double)res - ref);
            if (abserr > maxabserr) {
                maxabserr = abserr;
                maxabserrloc = arg;
            }
            relerr = fabs (((double)res - ref) / ref);
            if (relerr > maxrelerr) {
                maxrelerr = relerr;
                maxrelerrloc = arg;
            }
            arg = nextafterf (arg, -INFINITY);
        }
        printf ("maximum absolute error = %15.8e @ %15.8e\n", maxabserr, maxabserrloc);
        printf ("maximum relative error = %15.8e @ %15.8e\n", maxrelerr, maxrelerrloc);
        return EXIT_SUCCESS;
    }