Search code examples
cbinaryunionslow-level

Union - Binary to Double


Hello guys, I am trying to implement a new frexp function by hand. To do this I used a union data type. I can find the exponent correctly, but my problem is with the mantissa part: I cannot find the correct value for it. It gives me a really big number. I tried shifting the bits, but that didn't help either. Do you have any idea how I can find the mantissa from this binary representation? Thank you. (This is for double-precision floating point, and I assumed that double is 64 bits.)

P.S. There is one more thing I don't understand. To find the right exponent value, in theory I am supposed to subtract 1023 from the stored value (the bias), but in this example I needed to subtract 1022 to get the right value. Is something wrong?

/*
 * Overlays a double with three bit-fields so the sign, exponent and
 * mantissa fields can be read individually after storing into `f`.
 *
 * NOTE(review): the field widths (52 + 11 + 1 = 64) assume a 64-bit double.
 * The order in which bit-fields are allocated, and whether `unsigned long`
 * is accepted as a bit-field type or is wide enough for 52 bits, are
 * implementation-defined -- confirm on the target compiler/ABI.
 */
typedef union {
    /* The value viewed as a floating-point number. */
    double f;
    struct {
        /* Intended to map to the 52 fraction bits. */
        unsigned long  mantisa : 52;
        /* Intended to map to the 11-bit biased exponent. */
        unsigned long  exponent : 11;
        /* Intended to map to the sign bit. */
        unsigned long  sign : 1;
    } parts;
} double_cast;

/*
 * First attempt at a hand-written frexp().
 *
 * Stores the binary exponent of `number` through `exp` and prints the
 * intermediate values for debugging.
 *
 * NOTE: the return value is the raw mantissa bit-field converted to double
 * (a large integer-valued number), not a fraction in [0.5, 1.0).
 */
double myfrexp(double number, int *exp)
{
    double_cast bits;

    bits.f = number;

    /* Biased exponent field exactly as stored, shown in hex. */
    printf("\n %x \n", bits.parts.exponent);

    /* Subtract 1022 (one less than the 1023 bias mentioned in the question). */
    *exp = bits.parts.exponent - 1022;
    printf("\n%d\n\n", *exp);

    /* Raw mantissa field, printed as a floating-point number. */
    printf("\n %lf \n", (double)bits.parts.mantisa);

    return bits.parts.mantisa;
}

Thank you


Solution

  • To lessen endianness issues and to work around the inability to have 52-bit int bit-fields, use a union of double and uint64_t.

    This assumes that the endianness of double and the endianness of integers are the same. Most systems satisfy that, but not all. The code below depends on it.

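    The +1 in your post (subtracting 1022 instead of 1023) and the expo + 1 below are needed because the IEEE significand (not mantissa) satisfies 1.0 <= significand < 2.0, whereas frexp() returns a normalized fraction with 0.5 <= fraction < 1.0.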

    // Hand-written frexp() for a 64-bit double accessed through a
    // double/uint64_t union.
    //
    // Splits `number` into a fraction f with 0.5 <= |f| < 1.0 and an integer
    // exponent e such that number == f * 2^e.  The fraction is returned and
    // the exponent is stored through `exp`.
    //
    // Zero (either sign), infinities and NaNs are returned unchanged with
    // *exp set to 0.
    double myfrexp(double number, int *exp) {
      enum { FRACTION_BITS = 52, EXP_FIELD_MAX = 0x7FF, EXP_BIAS = 1023 };

      const uint64_t hidden_bit    = (uint64_t)1 << FRACTION_BITS;
      const uint64_t fraction_mask = hidden_bit - 1;
      const uint64_t sign_bit      = (uint64_t)1 << 63;
      // Exponent field of every value in [0.5, 1.0): biased value 1022.
      const uint64_t half_exponent = (uint64_t)(EXP_BIAS - 1) << FRACTION_BITS;

      union {
        double d;
        uint64_t u;
      } pun;
      pun.d = number;

      uint64_t frac = pun.u & fraction_mask;
      const int biased = (int)((pun.u >> FRACTION_BITS) & EXP_FIELD_MAX);

      // Inf/NaN (all-ones exponent) and +/-0 pass through untouched, which
      // also keeps the sign of -0.0.
      if (biased == EXP_FIELD_MAX || (biased == 0 && frac == 0)) {
        *exp = 0;
        return number;
      }

      int unbiased;
      if (biased != 0) {
        // Normal number: the leading 1 is implicit, frac is already final.
        unbiased = biased - EXP_BIAS;
      } else {
        // Subnormal: a biased exponent of 0 means 2^(1 - bias) with no
        // implicit bit.  Shift left until the leading 1 reaches the hidden
        // bit position, then drop it.
        unbiased = 1 - EXP_BIAS;
        for (; frac < hidden_bit; frac <<= 1) {
          unbiased--;
        }
        frac &= fraction_mask;
      }

      // Significand is in [1, 2) but the result must be in [0.5, 1): halve
      // the fraction (via half_exponent) and bump the exponent by one.
      *exp = unbiased + 1;
      pun.u = (pun.u & sign_bit) | half_exponent | frac;
      return pun.d;
    }
    
    #include <float.h>
    #include <limits.h>
    #include <math.h>
    #include <memory.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    
    // Runs the library frexp() and myfrexp() on the same input and prints a
    // line only when the two disagree in either the fraction or the exponent.
    void frexp_test(double d) {
      // Initialized because frexp() leaves the result unspecified for
      // infinities and NaNs, so the exponent may not be stored at all.
      int i1 = 0;
      int i2 = 0;
      double d1 = frexp(d, &i1);
      double d2 = myfrexp(d, &i2);

      // Compare the fractions byte-for-byte so that 0.0 and -0.0 count as
      // different results.
      if (memcmp(&d1, &d2, sizeof d1) != 0 || i1 != i2) {
        // The exponents are signed ints, so print them with %d; %x expects
        // an unsigned int and misrepresents negative exponents.
        printf("%a  (%a %d) (%a %d)\n", d, d1, i1, d2, i2);
      }
    }
    
    // Feeds a set of representative doubles (normal, zero of both signs,
    // extremes, subnormals and infinity) to frexp_test().
    int main(void) {
      const double cases[] = {
        1.0,
        0.0,
        -0.0,
        DBL_MAX,
        -DBL_MAX,
        DBL_EPSILON,
        DBL_MIN,
        DBL_MIN / 1024,         // subnormal
        DBL_MIN / 1024 / 1024,  // smaller subnormal
        INFINITY,
    #ifdef DBL_TRUE_MIN
        DBL_TRUE_MIN,  // smallest subnormal; macro only exists since C11
    #endif
      };

      for (size_t i = 0; i < sizeof cases / sizeof cases[0]; i++) {
        frexp_test(cases[i]);
      }
      return 0;
    }