Using peered GPUs in a single stream

In my current project I use GPUs for signal processing and visualization. I'm already using streams to allow for asynchronous operation. The signal is processed in frames and for each frame the processing steps in a stream are as following

memcpy to device
signal conditioning
image processing
visualization

Right now the steps are happening on a single GPU, however my machine has a Multi-GPU card (GeForce GTX 690) and I'd like to distribute the operation between the two devices. Basically I'd like to perform steps 1 and 2 on device A and steps 3 and 4 on device B, while the operations 1, 2, 3 and 4 are still executed as a single asynchronous stream. The desired outcome is a streaming layout that looks like this

Device A Stream a 1 2       1 2  ...
         Stream b      1 2  ...
Device B Stream a    3 4       3 4 ...
         Stream b         3 4  ...

How can I do this?

Solution

My previous attempt was not correct, because a stream is associated with the device it was created on. So I think the most direct answer to your question as posed in the title is "it can't be done". You cannot create a single stream and issue commands to more than one GPU from it. From here:

Stream and Event Behavior

A kernel launch or memory copy will fail if it is issued to a stream that is not associated to the current device

However while researching it, I noted that events are a suggested way to synchronize two streams on two different devices:

cudaStreamWaitEvent() will succeed even if the input stream and input event are associated to different devices. cudaStreamWaitEvent() can therefore be used to synchronize multiple devices with each other.

So in that vein, I created the following code to illustrate this:

#include <stdio.h>
#define SIZE 32
#define K1VAL 5
#define K3VAL 3

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)


__global__ void kernel1(int *frame, int size){

  int idx = threadIdx.x + (blockDim.x * blockIdx.x);

  if (idx == 0){
   int *a = new int[10000]; // just to make this kernel take a while
   for (int i = 0; i<10000; i++)
     a[i] = 0;
   for (int i = 0; i < size; i++)
     frame[i] += K1VAL;
   }
}


__global__ void kernel3(int *frame, int size){

  int idx = threadIdx.x + (blockDim.x * blockIdx.x);
  if (idx == 0)
   for (int i = 0; i < size; i++)
     frame[i] -= K3VAL;
}

void set_device(int dev){
  int ldev;
  cudaSetDevice(dev);
  cudaGetDevice(&ldev);
  cudaCheckErrors("set device error");
  if (ldev != dev){
    printf("set device mismatch error\n");
    exit(1);
    }
}


int main(){

  int A=0;
  int B=1;
  int framesize = SIZE*sizeof(int);
  int *h_frame;
  int *d_frame_aA,  *d_frame_bB;
  int numdev = 0;
  cudaGetDeviceCount(&numdev);
  cudaCheckErrors("can't determine number of devices");
  if (numdev < 2){
   printf("not enough devices!\n");
   return 1;
   }
  set_device(A);
  cudaMalloc((void **) &d_frame_aA, framesize);  // stream_a
  cudaMemset(d_frame_aA, 0, framesize);
  set_device(B);
  cudaMalloc((void **) &d_frame_bB, framesize);  // stream_b
  cudaMemset(d_frame_bB, 0, framesize);
  cudaHostAlloc((void **) &h_frame, framesize, cudaHostAllocDefault);
  cudaCheckErrors("allocations failure");

  set_device(A);
  cudaStream_t stream_a, stream_b;
  cudaStreamCreate(&stream_a);
  cudaEvent_t absync;
  cudaEventCreate(&absync);
  set_device(B);
  cudaStreamCreate(&stream_b);
  cudaCheckErrors("stream creation failure");


  for (int i = 0; i < SIZE; i++)
    h_frame[i] = 0;

  set_device(A);
  cudaDeviceEnablePeerAccess(B, 0);
  set_device(B);
  cudaDeviceEnablePeerAccess(A, 0);
  cudaCheckErrors("enable peer access fail");


  set_device(A);
  cudaMemcpyAsync(d_frame_aA, h_frame, framesize, cudaMemcpyHostToDevice, stream_a);
  kernel1<<<1,1,0, stream_a>>>(d_frame_aA, SIZE);
  cudaCheckErrors("kernel1 fail");
  cudaMemcpyPeerAsync(d_frame_bB, B, d_frame_aA, A, framesize, stream_a );
  cudaCheckErrors("memcpypeer fail");
  cudaEventRecord(absync, stream_a);
  set_device(B);
  // comment out the next line to see the failure
  cudaStreamWaitEvent(stream_b, absync, 0);
  kernel3<<<1,1,0, stream_b>>>(d_frame_bB, SIZE);
  cudaCheckErrors("main sequence fail");


//  cudaCheckErrors("main sequence failure");

  cudaMemcpy(h_frame, d_frame_bB, framesize, cudaMemcpyDeviceToHost);
  cudaCheckErrors("results_a memcpy fail");
  for (int i = 0; i < SIZE; i++)
    if (h_frame[i] != (K1VAL - K3VAL)) {
      printf("results error\n");
      return 1;
      }
  printf("success\n");
  return 0;

}

If you run the code as-is, you should get a success message. If you comment out the line that forces stream b (on Device B) to wait on stream a (on Device A), then you'll see a results error message. So this demonstrates how to sync a stream on one device to a stream on another. Hope it helps. Sorry for the confusion on the first go-round.