Search code examples
cuda register-allocation

Cuda single-thread scoped variables


Is it possible to make cuda use single-thread scoped variables (register or local memory) that are declared outside a function?

Most of my device functions need to use the same variables.

Instead of passing the same variables as parameters to all my device functions, I would like to declare the variables outside the functions.

Is that possible?

My compute capability is 1.2.

EDIT: An example:

// --- Desired version (NOT valid CUDA) ---
// NOTE(review): "__local__" is not a CUDA C/C++ variable qualifier.  A
// file-scope __device__ variable resides in global memory and is shared by
// ALL threads, so declarations like these cannot provide the per-thread
// (register / local-memory) scoping the question asks for.
__device__ __local__ int id;
__device__ __local__ int variable1 = 3;
__device__ __local__ int variable2 = 5;
__device__ __local__ int variable3 = 8;
__device__ __local__ int variable4 = 8;

//
// Reads and writes the file-scope variables directly; if these were truly
// shared __device__ globals, every thread would race on them.
__device__ int deviceFunction3() {
  variable1 += 8;
  variable4 += 7;
  variable2 += 1;
  variable3 += id;

  return variable1 + variable2 + variable3;
}

// Calls deviceFunction3() twice, accumulating into the shared variables.
__device__ int deviceFunction2() {
  variable3 += 8; 
  variable1 += deviceFunction3();
  variable4 += deviceFunction3();

  return variable3 + variable4;
}

// Top-level device helper invoked by the kernel.
__device__ int deviceFunction1() {
  variable1 += id;
  variable4 += 2;
  variable2 += deviceFunction2();
  variable3 += variable2 + variable4;
  return variable1 + variable2 + variable3 + variable4;
}

// Kernel
// NOTE(review): get_id() is not defined in this snippet — presumably a
// helper returning the flat global thread index; confirm.
__global__ void kernel(int *dev_a, int *dev_b, int *dev_c) {
  id = get_id();

  dev_c[id] = deviceFunction1();
}

The 3 device functions need to manipulate the same variables. Each variable is calculated separately for each thread. To my understanding, I cannot use the above code, because I cannot declare the variables so that they are local to each thread.

What I have to do instead is to declare all variables inside the kernel function, and then pass pointers to the variables to all the other functions:

// --- Working version: per-thread state declared inside the kernel and ---
// --- threaded through every device function as pointers.              ---

// NOTE(review): the "id" parameter is unused here, and this line adds the
// constant 2 to *variable3 where the first version added "id" — confirm
// which behaviour is intended.
__device__ int deviceFunction3(int* id,int* variable1,int* variable2,int* variable3,int* variable4) {
  *variable1 += 8;
  *variable4 += 7;
  *variable2 += 1;
  *variable3 += 2;

  return *variable1 + *variable2 + *variable3;
}

// Calls deviceFunction3() twice, forwarding all five pointers each time.
__device__ int deviceFunction2(int* id,int* variable1,int* variable2,int* variable3,int* variable4) {
  *variable3 += 8; 
  *variable1 += deviceFunction3(id,variable1,variable2,variable3,variable4);
  *variable4 += deviceFunction3(id,variable1,variable2,variable3,variable4);

  return *variable3 + *variable4;
}

// Top-level device helper; receives pointers to the kernel's locals so that
// each thread mutates only its own copies.
__device__ int deviceFunction1(int* id,int* variable1,int* variable2,int* variable3,int* variable4) {
  *variable1 += *id;
  *variable4 += 2;
  *variable2 += deviceFunction2(id,variable1,variable2,variable3,variable4);
  *variable3 += *variable2 + *variable4;
  return *variable1 + *variable2 + *variable3 + *variable4;
}

// Kernel
// The variables are ordinary automatic (per-thread) locals here, which is
// what makes this version legal — at the cost of the verbose parameter lists.
__global__ void kernel(int *dev_a, int *dev_b, int *dev_c) {
  int id = get_id();
  int variable1 = 3;
  int variable2 = 5;
  int variable3 = 8;
  int variable4 = 8;

  dev_c[id] = deviceFunction1(&id,&variable1,&variable2,&variable3,&variable4);
}

Solution

  • Your usage case is a truly awful idea, and I wouldn't recommend that design pattern to my worst enemy. Leaving aside the merits of the code for a moment, as I hinted in comments, you can achieve the thread local variable scoping you desire by encapsulating the __device__ functions and variables they rely on in a structure, like this:

    // Encapsulates the per-thread state: each thread constructs its own
    // instance as an automatic variable inside the kernel, so the members
    // are thread-local, and the __device__ member functions share them
    // without any parameter passing.
    struct folly
    {
        int id;          // flat global thread index (set by the constructor)
        int variable1;
        int variable2;
        int variable3;
        int variable4;
    
        // Initialise the per-thread state; _id is the caller's thread index.
        __device__ folly(int _id) {
            id = _id;
            variable1 = 3;
            variable2 = 5;
            variable3 = 8;
            variable4 = 8;
        }
    
        // Mutates four members and returns a partial sum; mirrors the
        // question's deviceFunction3 (including the "variable3 += id" term).
        __device__ int deviceFunction3() {
            variable1 += 8;
            variable4 += 7;
            variable2 += 1;
            variable3 += id;
    
            return variable1 + variable2 + variable3;
        }
    
        // Calls deviceFunction3() twice, accumulating into the members.
        __device__ int deviceFunction2() {
            variable3 += 8; 
            variable1 += deviceFunction3();
            variable4 += deviceFunction3();
    
            return variable3 + variable4;
        }
    
        // Top-level entry point used by the kernel.
        __device__ int deviceFunction1() {
            variable1 += id;
            variable4 += 2;
            variable2 += deviceFunction2();
            variable3 += variable2 + variable4;
            return variable1 + variable2 + variable3 + variable4;
        }
    };
    
    // Kernel: each thread builds its own folly instance (per-thread state)
    // and stores the result at its flat global index.
    // NOTE(review): no bounds check against the length of dev_c — assumes the
    // grid exactly covers the array; confirm at the launch site.
    __global__ void kernel(int *dev_a, int *dev_b, int *dev_c) {
        int id = threadIdx.x + blockIdx.x * blockDim.x;
        folly do_calc(id);
        dev_c[id] = do_calc.deviceFunction1();
    }
    

    Also note that CUDA supports C++ style pass by reference, so any one of the device functions you have written in the second piece of code you posted could easily be written like this:

    // Pass-by-reference alternative: same semantics as the pointer version
    // but without the explicit dereferences.  (The unused "id" parameter of
    // the pointer version is dropped here.)
    __device__ int deviceFunction3(int & variable1, int & variable2, 
                                   int & variable3, int & variable4) 
    {
      variable1 += 8;
      variable4 += 7;
      variable2 += 1;
      variable3 += 2;
    
      return variable1 + variable2 + variable3;
    }
    

    which is far cleaner and easier to read.