CUDA Warp Synchronization Problem


In generalizing a kernel that shifts the values of a 2D array one space to the right (wrapping around the row boundaries), I have come across a warp synchronization problem. The full code is included below.

The code is meant to work for an arbitrary array width, array height, number of thread blocks, and number of threads per block. When choosing a block size of 33 threads (i.e. one more thread than a full warp), the 33rd thread doesn't synchronize when __syncthreads() is called. This corrupts the output data. The problem only appears when there is more than one warp and the width of the array exceeds the number of threads (e.g. width=35 with 34 threads).

The following is a downsized example of what happens (in reality the array would need to have more elements for the kernel to produce the error).

Initial array:

0 1 2 3 4 
5 6 7 8 9

Expected Result:

4 0 1 2 3
9 5 6 7 8

Kernel Produces:

4 0 1 2 3
8 5 6 7 8

The first row is shifted correctly (for each block, if there is more than one), while all subsequent rows have the second-to-last value repeated. I have tested this on two different cards (an 8600GT and a GTX280) and get the same results. I would like to know whether this is just a bug in my kernel, or a problem that can't be fixed by adjusting my code.
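
To isolate the pattern I suspect, here is a stripped-down version of the loop in question (illustrative only; the kernel name is made up and this is not part of the program below):

__global__ void divergentBarrier(int width)
{
  //with blockDim.x = 33 and width = 34, thread 0 runs two iterations
  //(offsets 0 and 33) while threads 1-32 run only one, so thread 0
  //reaches the barrier one more time than the rest of its block
  for (int x = threadIdx.x; x < width; x += blockDim.x)
  {
    //...reads and writes...
    __syncthreads();  //called a thread-dependent number of times
  }
}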

The full source file is included below.

Thank you.

#include <cstdio>
#include <cstdlib>

// Map an (x, y) coordinate to its 1D row-major index so host and device share the same layout.
inline __device__ __host__ int loc(int x, int y, int width)
{
  return y*width + x;
}

//kernel to shift all items in a 2D array one position to the right (wrapping around rows)
__global__ void shiftRight ( int* globalArray, int width, int height)
{
  int temp1=0;          //temporary swap variables
  int temp2=0;

  int blockRange=0;     //the number of rows that a single block will shift

  if (height%gridDim.x==0)  //logic to account for awkward array sizes
    blockRange = height/gridDim.x;
  else
    blockRange = (1+height/gridDim.x);

  int yStart = blockIdx.x*blockRange;
  int yEnd = yStart+blockRange; //the end condition for the y-loop
  yEnd = min(height,yEnd);              //make sure that the array doesn't go out of bounds

  for (int y = yStart; y < yEnd ; ++y)
  {
    //do the first read so the swap variables are loaded for the x-loop
    temp1 = globalArray[loc(threadIdx.x,y,width)];
    //Each block shifts an entire row by itself, even if there are more columns than threads
    for (int threadXOffset = threadIdx.x  ; threadXOffset < width ; threadXOffset+=blockDim.x)
    {
      //blockDim.x is added so that we store the next round of values
      //this has to be done now, because the next operation will
      //overwrite one of these values
      temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
      __syncthreads();  //sync before the write to ensure all the values have been read
      globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
      __syncthreads();  //sync after the write so ensure all the values have been written
      temp1 = temp2;        //swap the storage variables.
    }
  }
}


int main (int argc, char* argv[])
{
  //set the parameters to be used
  int width = 34;
  int height = 3;
  int threadsPerBlock=33;
  int numBlocks = 1;

  int memSizeInBytes = width*height*sizeof(int);

  //create the host data and assign each element of the array to equal its index
  int* hostData = (int*) malloc (memSizeInBytes);
  for (int y = 0 ; y < height ; ++y)
    for (int x = 0 ; x < width ; ++x)
      hostData [loc(x,y,width)] = loc(x,y,width);

  //create and allocate the device pointers
  int* deviceData;
  cudaMalloc (&deviceData, memSizeInBytes);
  cudaMemset (deviceData, 0, memSizeInBytes);
  cudaMemcpy (deviceData, hostData, memSizeInBytes, cudaMemcpyHostToDevice);
  cudaThreadSynchronize();

  //launch the kernel
  shiftRight<<<numBlocks,threadsPerBlock>>> (deviceData, width, height);
  cudaThreadSynchronize();

  //copy the device data to a host array
  int* hostDeviceOutput = (int*) malloc (memSizeInBytes);
  cudaMemcpy (hostDeviceOutput, deviceData, memSizeInBytes, cudaMemcpyDeviceToHost); 
  cudaFree (deviceData);

  //Print out the expected/desired device output
  printf("---- Expected Device Output ----\n");
  printf("   | ");
  for (int x = 0 ; x < width ; ++x)
    printf("%4d ",x);
  printf("\n---|-");
  for (int x = 0 ; x < width ; ++x)
    printf("-----");
  for (int y = 0 ; y < height ; ++y)
  {
    printf("\n%2d | ",y);
    for (int x = 0 ; x < width ; ++x)
      printf("%4d ",hostData[loc((x-1+width)%width,y,width)]);
  }
  printf("\n\n");

  printf("---- Actual Device Output ----\n");
  printf("   | ");
  for (int x = 0 ; x < width ; ++x)
    printf("%4d ",x);
  printf("\n---|-");
  for (int x = 0 ; x < width ; ++x)
    printf("-----");
  for (int y = 0 ; y < height ; ++y)
  {
    printf("\n%2d | ",y);
    for (int x = 0 ; x < width ; ++x)
      printf("%4d ",hostDeviceOutput[loc(x,y,width)]);
  }
  printf("\n\n");

  free(hostData);
  free(hostDeviceOutput);
  return 0;
}

Solution

  • Because not all threads execute the same number of loop iterations, synchronization is the problem: every thread in a block must reach the same __syncthreads() calls, the same number of times.

    I would suggest transforming your innermost for loop into something like this (a sketch of the complete kernel follows the snippet):

    for (int blockXOffset = 0; blockXOffset < width; blockXOffset += blockDim.x) {
      int threadXOffset = blockXOffset + threadIdx.x;
      bool isActive = (threadXOffset < width);  //threads past the row width skip the memory accesses...
      if (isActive) temp2 = globalArray[loc((threadXOffset + blockDim.x)%width,y,width)];
      __syncthreads();  //...but still reach both barriers with the rest of the block
      if (isActive) globalArray[loc((threadXOffset +1)%width,y,width)] = temp1;
      __syncthreads();
      temp1 = temp2;
    }
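
    For completeness, here is a sketch of the whole kernel with the fix folded in. It reuses the question's loc() helper; the guard on the initial temp1 read is an added precaution for the case blockDim.x > width, which the original launch parameters never hit.

    __global__ void shiftRight (int* globalArray, int width, int height)
    {
      int temp1 = 0;  //temporary swap variables
      int temp2 = 0;

      //ceiling division: the number of rows each block shifts
      int blockRange = (height + gridDim.x - 1) / gridDim.x;

      int yStart = blockIdx.x * blockRange;
      int yEnd = min(height, yStart + blockRange);

      for (int y = yStart; y < yEnd; ++y)
      {
        //preload the first value; guarded in case there are more threads than columns
        if (threadIdx.x < width)
          temp1 = globalArray[loc(threadIdx.x, y, width)];

        for (int blockXOffset = 0; blockXOffset < width; blockXOffset += blockDim.x)
        {
          int threadXOffset = blockXOffset + threadIdx.x;
          bool isActive = (threadXOffset < width);
          if (isActive)  //only in-range threads touch memory...
            temp2 = globalArray[loc((threadXOffset + blockDim.x) % width, y, width)];
          __syncthreads();  //...but every thread reaches both barriers
          if (isActive)
            globalArray[loc((threadXOffset + 1) % width, y, width)] = temp1;
          __syncthreads();
          temp1 = temp2;
        }
      }
    }

    Because the loop bound now depends only on width and blockDim.x, every thread in the block executes the same number of iterations and hits each __syncthreads() the same number of times; the isActive flag confines the divergence to the memory accesses, where it is harmless.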