How to reduce code duplication between OpenCL kernels?

I have several similar kernels to generate random data and store it in global memory. I'm always using the same algorithm to randomize, but due to variable scope issues (I need to keep track of data) I fail to avoid severe code duplications.

Are there any ways to avoid this? Generating random data in OpenCL seems a fairly standard task, but it goes against any good coding standards to have this level of code duplication. For example, here are two of my kernels:

////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to AWGN channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterAWGN(__global double* d_Rand, 
                  __global int* seeds,
                              __global long* inputcw,
                  int nPerRng, float sigma)
{
    int globalID = get_global_id(0);
    double c = 2.0/(sigma*sigma);

    int iState, iState1, iStateM, iOut;
    unsigned int mti, mti1, mtiM, x;
    unsigned int mt[MT_NN]; 

    //Initialize current state
    mt[0] = seeds[globalID];
    for (iState = 1; iState < MT_NN; iState++)
        mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;

    iState = 0;
    mti1 = mt[0];
    for (iOut = 0; iOut < nPerRng; iOut=iOut+2) {
        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti  = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        double u1 = ((double)x + 1.0f) / 4294967296.0f;

        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti  = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        double u2 = ((double)x + 1.0f) / 4294967296.0f;

        double r = sqrt(-2.0f * log(u1));
        double phi = 2 * PI * u2;

        u1 = r * cos(phi);
        u1 = inputcw[iOut]+sigma*u1;
        u1=1/(1+exp(-c*u1));
        d_Rand[globalID * nPerRng + iOut]=log((1-u1)/u1);
        if (iOut!=nPerRng-1) {
            u2 = r * sin(phi);
            u2 = inputcw[iOut+1]+sigma*u2;
            u2=1/(1+exp(-c*u2));
            u2=log((1-u2)/u2);
            d_Rand[globalID * nPerRng + iOut+1]=u2;
        }
    }
}

and

////////////////////////////////////////////////////////////////////////////////
// OpenCL Kernel for Mersenne Twister RNG -- applied to BSC channel
////////////////////////////////////////////////////////////////////////////////
__kernel void MersenneTwisterBSC(__global double* d_Rand, 
                  __global int* seeds,
                              __global long* inputcw,
                  int nPerRng, float flipProb)
{
    int globalID = get_global_id(0);

    int iState, iState1, iStateM, iOut;
    unsigned int mti, mti1, mtiM, x;
    unsigned int mt[MT_NN]; 

    //Initialize current state
    mt[0] = seeds[globalID];
    for (iState = 1; iState < MT_NN; iState++)
        mt[iState] = (1812433253U*(mt[iState-1]^(mt[iState-1]>>30))+iState) & MT_WMASK;

    iState = 0;
    mti1 = mt[0];
    for (iOut = 0; iOut < nPerRng; iOut=iOut+1) {
        iState1 = iState + 1;
        iStateM = iState + MT_MM;
        if(iState1 >= MT_NN) iState1 -= MT_NN;
        if(iStateM >= MT_NN) iStateM -= MT_NN;
        mti  = mti1;
        mti1 = mt[iState1];
        mtiM = mt[iStateM];

        // MT recurrence
        x = (mti & MT_UMASK) | (mti1 & MT_LMASK);
        x = mtiM ^ (x >> 1) ^ ((x & 1) ? matrix_a : 0);

        mt[iState] = x;
        iState = iState1;

        //Tempering transformation
        x ^= (x >> MT_SHIFT0);
        x ^= (x << MT_SHIFTB) & mask_b;
        x ^= (x << MT_SHIFTC) & mask_c;
        x ^= (x >> MT_SHIFT1);

        double c = log((1-flipProb)/flipProb);
        double u = ((double)x + 1.0f) / 4294967296.0f;
        u = (2*isless(u,flipProb)-1)*inputcw[iOut]*c;
        d_Rand[globalID * nPerRng + iOut]=u;
    }
}

Are there any ways, tricks or methods to avoid this? Subroutines seem unable to make proper use of the variables (especially mt), so I didn't manage to cut it down in the way other languages would allow to.

Or should I just accept this as a necessary evil in OpenCL and keep managing 10 different kernels this way?

Solution

At Khronos' site, it says

OpenCL programs may also contain auxiliary functions and constant data that can be used by __kernel functions.

An example to generate random number between 0.0f and 1.0f per thread:

Core function to iterate a seed:

uint wang_hash(uint seed)
{
   seed = (seed ^ 61) ^ (seed >> 16);
   seed *= 9;
   seed = seed ^ (seed >> 4);
   seed *= 0x27d4eb2d;
   seed = seed ^ (seed >> 15);
   return seed;
}

Initialization and iteration of each threads seed:

// id=thread id, rnd=seed array
void wang_rnd_init(__global unsigned int * rnd,int id)                
{
     uint maxint=0;
     maxint--;  // could be a 0xFFFFFFFF
     uint rndint=wang_hash(id);
     rnd[id]=rndint;
}

// id=thread id, rnd=seed array
float wang_rnd(__global unsigned int * rnd,int id)                
{
     uint maxint=0;
     maxint--;  // could be a 0xFFFFFFFF
     uint rndint=wang_hash(rnd[id]);
     rnd[id]=rndint;
     return ((float)rndint)/(float)maxint;
}

Usage in a random grayscale color pixel generator kernel:

__kernel void rnd_1(__global unsigned int * rnd, __global int *rgba)
{
      int id=get_global_id(0);
      float rgba_register=wang_rnd(rnd,id);
      rgba[id] = ((int)(rgba_register * 255) << 24) | ((int)(rgba_register * 255) << 16) | ((int)(rgba_register * 255) << 8) | ((int)(rgba_register * 255));
}

and wang_rnd() can be used in other kernels without defining it twice if they are in same compiled context, same as putting all relevant kernels and functions in the same file to be compiled.

Auxilliary functions are not limited to registers and global memory. They can take local and constant memory parameters too. Since they are working with device side memory mainly, they can take and return structs too.