static __inline__ uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t* hip) {
__uint128_t product = ((__uint128_t)a)*((__uint128_t)b);
*hip = product>>64;
return (uint64_t)product;
}
I am trying to write following above using MULX intrinsics on AVX2 (more specifically BMI2). But they do not give the same results.
static __inline__ uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *c){
return _mulx_u64(a, b, &c);
}
It looks like this function could be wrong:
static __inline__ uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *c){
return _mulx_u64(a, b, &c);
}
It should probably be:
static __inline__ uint64_t mulhilo64(uint64_t a, uint64_t b, uint64_t *c){
return _mulx_u64(a, b, c);
} // ^
Note that compiling with warnings enabled (e.g. gcc -Wall ...
) helps to catch simple mistakes like this.