Search code examples
c++c++11optimizationcmath

Best way calculating remainder on floating points


Question

What is the computational cost of the remainder function, is there a specific instruction to calculate it in a cheap way in a specific case?

Description

I need to transform a mathematical variable x into the range of I=[-0.5; 0.5) from R=[-2; 2). While x is not element of I, then x is shifted towards I by repeatedly adding or subtracting 1 to the value of x. x is represented with double x in my code. I need the fastest way of this transformation for this I and R values but wider R ranges can be also interesting.

Ideas and speed comparison

The function I was suggested to use was the naive implementation following the description:

void shift_to_I(double& x) // version 1
{
    while (x < -0.5)
            x += 1;

    while (x >= 0.5)
            x -= 1;
}

Not only for speed considerations but also for code quality I was thinking of using remainder from <cmath> introduced in c++11. With remainder the code shortens to

void shift_to_I(double& x) // version 2
{
    x = remainder(x,1);
}

I had to realize though that it was slower than the original function on my architecture (Intel i7 whatver with VC++). I believed there was a dedicated instruction for this purpose, but either the compiler doesn't know it or it doesn't exist. For wider R interval (on my architecture it is around [-25; 25)) the second version will be faster but I need a code that is fast for narrow intervals too. clang and gcc specific solutions are also welcomed.


Solution

  • This question is compiler and implementation dependent.

    For example, on my machine with GCC 8.3:

    1. Without -ffast-math, std::remainder translates into a call to this function:

      double __remainder(double x, double y)
      {
          if (((__builtin_expect (y == 0.0, 0) && ! isnan(x)) || (__builtin_expect(isinf(x), 0) && ! isnan(y))) && _LIB_VERSION != _IEEE_)
              return __kernel_standard(x, y, 28);
          return __ieee754_remainder(x, y);
      }
      

      with __ieee754_remainder looking like this:

      double __ieee754_remainder(double x, double y)
      {
          double z, d, xx;
          int4 kx, ky, n, nn, n1, m1, l;
          mynumber u, t, w = {{0, 0}}, v = {{0, 0}}, ww = {{0, 0}}, r;
          u.x = x;
          t.x = y;
          kx = u.i[HIGH_HALF] & 0x7fffffff; /* no sign  for x*/
          t.i[HIGH_HALF] &= 0x7fffffff;     /*no sign for y */
          ky = t.i[HIGH_HALF];
          /*------ |x| < 2^1023  and   2^-970 < |y| < 2^1024 ------------------*/
          if (kx < 0x7fe00000 && ky < 0x7ff00000 && ky >= 0x03500000)
          {
              SET_RESTORE_ROUND_NOEX(FE_TONEAREST);
              if (kx + 0x00100000 < ky)
                  return x;
              if ((kx - 0x01500000) < ky)
              {
                  z = x / t.x;
                  v.i[HIGH_HALF] = t.i[HIGH_HALF];
                  d = (z + big.x) - big.x;
                  xx = (x - d * v.x) - d * (t.x - v.x);
                  if (d - z != 0.5 && d - z != -0.5)
                      return (xx != 0) ? xx : ((x > 0) ? ZERO.x : nZERO.x);
                  else
                  {
                      if (fabs(xx) > 0.5 * t.x)
                          return (z > d) ? xx - t.x : xx + t.x;
                      else
                          return xx;
                  }
              } /*    (kx<(ky+0x01500000))         */
              else
              {
                  r.x = 1.0 / t.x;
                  n = t.i[HIGH_HALF];
                  nn = (n & 0x7ff00000) + 0x01400000;
                  w.i[HIGH_HALF] = n;
                  ww.x = t.x - w.x;
                  l = (kx - nn) & 0xfff00000;
                  n1 = ww.i[HIGH_HALF];
                  m1 = r.i[HIGH_HALF];
                  while (l > 0)
                  {
                      r.i[HIGH_HALF] = m1 - l;
                      z = u.x * r.x;
                      w.i[HIGH_HALF] = n + l;
                      ww.i[HIGH_HALF] = (n1) ? n1 + l : n1;
                      d = (z + big.x) - big.x;
                      u.x = (u.x - d * w.x) - d * ww.x;
                      l = (u.i[HIGH_HALF] & 0x7ff00000) - nn;
                  }
                  r.i[HIGH_HALF] = m1;
                  w.i[HIGH_HALF] = n;
                  ww.i[HIGH_HALF] = n1;
                  z = u.x * r.x;
                  d = (z + big.x) - big.x;
                  u.x = (u.x - d * w.x) - d * ww.x;
                  if (fabs(u.x) < 0.5 * t.x)
                      return (u.x != 0) ? u.x : ((x > 0) ? ZERO.x : nZERO.x);
                      else if (fabs(u.x) > 0.5 * t.x)
                      return (d > z) ? u.x + t.x : u.x - t.x;
                  else
                  {
                      z = u.x / t.x;
                      d = (z + big.x) - big.x;
                      return ((u.x - d * w.x) - d * ww.x);
                  }
              }
          } /*   (kx<0x7fe00000&&ky<0x7ff00000&&ky>=0x03500000)     */
          else
          {
              if (kx < 0x7fe00000 && ky < 0x7ff00000 && (ky > 0 || t.i[LOW_HALF] != 0))
              {
                  y = fabs(y) * t128.x;
                  z = __ieee754_remainder(x, y) * t128.x;
                  z = __ieee754_remainder(z, y) * tm128.x;
                  return z;
              }
              else
              {
                  if ((kx & 0x7ff00000) == 0x7fe00000 && ky < 0x7ff00000 &&
                      (ky > 0 || t.i[LOW_HALF] != 0))
                  {
                      y = fabs(y);
                      z = 2.0 * __ieee754_remainder(0.5 * x, y);
                      d = fabs(z);
                      if (d <= fabs(d - y))
                          return z;
                      else if (d == y)
                          return 0.0 * x;
                      else
                          return (z > 0) ? z - y : z + y;
                  }
                  else /* if x is too big */
                  {
                      if (ky == 0 && t.i[LOW_HALF] == 0) /* y = 0 */
                          return (x * y) / (x * y);
                      else if (kx >= 0x7ff00000    /* x not finite */
                               || (ky > 0x7ff00000 /* y is NaN */
                                      || (ky == 0x7ff00000 && t.i[LOW_HALF] != 0)))
                          return (x * y) / (x * y);
                      else
                          return x;
                  }
              }
          }
      }
      

      Pretty far from a single machine instruction.

    2. With -ffast-math, single fprem1 assembly instruction is used.