I'm currently trying to implement an XOR_SHIFT Random Number Generator using AVX2, it's actually quite easy and very fast. However I need to be able to specify a range. This usually requires modulo.
This is a major problem for me for 2 reasons:
Are the any significantly faster ways to do modulo using AVX2?
Non Vector Code:
std::srand(std::time(nullptr));
std::mt19937_64 e(std::rand());
uint32_t seed = static_cast<uint32_t>(e());
for (; i != end; ++i)
{
seed ^= (seed << 13u);
seed ^= (seed >> 7u);
seed ^= (seed << 17u);
arr[i] = static_cast<T>(low + (seed % ((up + 1u) - low)));
}//End for
Vectorized:
constexpr uint32_t thirteen = 13u;
constexpr uint32_t seven = 7u;
constexpr uint32_t seventeen = 17u;
const __m256i _one = _mm256_set1_epi32(1);
const __m256i _lower = _mm256_set1_epi32(static_cast<uint32_t>(low));
const __m256i _upper = _mm256_set1_epi32(static_cast<uint32_t>(up));
__m256i _temp = _mm256_setzero_si256();
__m256i _res = _mm256_setzero_si256();
__m256i _seed = _mm256_set_epi32(
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e()),
static_cast<uint32_t>(e())
);
for (; (i + 8uz) < end; ++i)
{
//Generate Random Numbers
_temp = _mm256_slli_epi32(_seed, thirteen);
_seed = _mm256_xor_si256(_seed, _temp);
_temp = _mm256_srai_epi32(_seed, seven);
_seed = _mm256_xor_si256(_seed, _temp);
_temp = _mm256_slli_epi32(_seed, seventeen);
_seed = _mm256_xor_si256(_seed, _temp);
//Narrow
_temp = _mm256_add_epi32(_upper, _one);
_temp = _mm256_sub_epi32(_temp, _lower);
_temp = _mm256_rem_epu32(_seed, _temp); //Comment this line out for a massive speed up but incorrect results
_res = _mm256_add_epi32(_lower, _temp);
_mm256_store_si256((__m256i*) &arr[i], _res);
}//End for
If you range is smaller than ~16.7 million, and you don’t need cryptography-grade quality of the distribution, an easy and relatively fast method of narrowing these random numbers is FP32 math.
Here’s an example, untested. The function below takes integer vector with random bits, and converts these bits into integer numbers in [ 0 .. range - 1 ] interval.
// Ideally, make sure this function is inlined,
// by applying __forceinline for vc++ or __attribute__((always_inline)) for gcc/clang
inline __m256i narrowRandom( __m256i bits, int range )
{
assert( range > 1 );
// Convert random bits into FP32 number in [ 1 .. 2 ) interval
const __m256i mantissaMask = _mm256_set1_epi32( 0x7FFFFF );
const __m256i mantissa = _mm256_and_si256( bits, mantissaMask );
const __m256 one = _mm256_set1_ps( 1 );
__m256 val = _mm256_or_ps( _mm256_castsi256_ps( mantissa ), one );
// Scale the number from [ 1 .. 2 ) into [ 0 .. range ),
// the formula is ( val * range ) - range
const __m256 rf = _mm256_set1_ps( (float)range );
val = _mm256_fmsub_ps( val, rf, rf );
// Convert to integers
// The instruction below always truncates towards 0 regardless on MXCSR register.
// If you want ranges like [ -10 .. +10 ], use _mm256_add_epi32 afterwards
return _mm256_cvttps_epi32( val );
}
When inlined, it should compile into 4 instructions, vpand
, vorps
, vfmsub132ps
, vcvttps2dq
Probably an order of magnitude faster than _mm256_rem_epu32
in your example.