Search code examples
cuda · cuda-streams

What's the capacity of a CUDA stream (=queue)?


A CUDA stream is a queue of tasks: memory copies, event firing, event waits, kernel launches, callbacks...

But - these queues don't have infinite capacity. In fact, empirically, I find that this limit is not super-high, e.g. in the thousands, not millions.

My questions:

  1. Is the size/capacity of a CUDA stream fixed in terms of any kind of enqueued items, or does the capacity behave differently based on what kind of actions/tasks you enqueue?
  2. How can I determine this capacity other than enqueuing more and more stuff until I can no longer fit any?

Solution

  • Is the size/capacity of a CUDA stream fixed in terms of any kind of enqueued items, or does the capacity behave differently based on what kind of actions/tasks you enqueue?

    The "capacity" behaves differently based on actions/tasks you enqueue.

    Here is a demonstration:

    If we enqueue a single host function/callback in the midst of a number of kernel calls, on a Tesla V100 on CUDA 11.4 I observe a "capacity" for ~1000 enqueued items. However if I alternate kernel calls and host functions, I observe a capacity for ~100 enqueued items.

    // test case with alternating kernels and callbacks
    
    $ cat t2042a.cu
    #include <condition_variable>
    #include <iostream>
    #include <memory>
    #include <mutex>
    #include <vector>
    
    #define CUDACHECK(x) x
    // empty kernel: each launch adds one item to the stream's queue
    __global__ void NoOpKernel() {}
    
    // One-shot host-side latch: a stream callback calls Wait() and blocks
    // until another host thread calls Signal().
    class Event {
     private:
      std::mutex mu_;                // guards fired_
      std::condition_variable cv_;   // notified by Signal()
      bool fired_ = false;           // latch state; never reset

     public:
      // Mark the event signalled and wake every thread blocked in Wait().
      // Subsequent Wait() calls return immediately.
      void Signal() {
        {
          std::lock_guard<std::mutex> lock(mu_);
          fired_ = true;
        }
        cv_.notify_all();
      }

      // Block the calling thread until Signal() has been invoked.
      // The predicate overload of wait() handles spurious wakeups.
      void Wait() {
        std::unique_lock<std::mutex> lock(mu_);
        cv_.wait(lock, [this] { return fired_; });
      }
    };
    
    // CUDA host-function callback: blocks the runtime's callback thread —
    // and therefore the stream — until the Event passed via `arg` is
    // signalled from the host.
    void CUDART_CB block_op_host_fn(void* arg) {
      static_cast<Event*>(arg)->Wait();
    }
    
    int main() {
      cudaStream_t stream;
      CUDACHECK(cudaStreamCreate(&stream));

      // Each iteration enqueues one kernel launch plus one host callback
      // that blocks until Signal() below. Nothing drains, so the stream's
      // internal queue fills; empirically (V100, CUDA 11.4) the launch
      // around iteration ~57 blocks inside the runtime.
      int num_events = 60; // 50 is okay, 60 will hang
      std::vector<std::shared_ptr<Event>> event_vec;

      for (int i = 0; i < num_events; i++) {
        std::cout << "Queuing NoOp " << i << std::endl;
        NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
        std::cout << "Queued NoOp " << i << std::endl;

        event_vec.push_back(std::make_shared<Event>());
        // check the launch like every other CUDA call in this demo
        CUDACHECK(cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get()));

        std::cout << "Queued block_op " << i << std::endl;
      }

      // Release every queued callback so the stream can drain.
      for (int i = 0; i < num_events; i++) {
        event_vec[i]->Signal();
      }

      // clean up
      CUDACHECK(cudaDeviceSynchronize());
      CUDACHECK(cudaStreamDestroy(stream));
      return 0;
    }
    $ nvcc -o t2042a t2042a.cu
    $ ./t2042a
    Queuing NoOp 0
    Queued NoOp 0
    Queued block_op 0
    Queuing NoOp 1
    Queued NoOp 1
    Queued block_op 1
    Queuing NoOp 2
    Queued NoOp 2
    Queued block_op 2
    Queuing NoOp 3
    Queued NoOp 3
    Queued block_op 3
    Queuing NoOp 4
    Queued NoOp 4
    Queued block_op 4
    Queuing NoOp 5
    Queued NoOp 5
    Queued block_op 5
    Queuing NoOp 6
    Queued NoOp 6
    Queued block_op 6
    Queuing NoOp 7
    Queued NoOp 7
    Queued block_op 7
    Queuing NoOp 8
    Queued NoOp 8
    Queued block_op 8
    Queuing NoOp 9
    Queued NoOp 9
    Queued block_op 9
    Queuing NoOp 10
    Queued NoOp 10
    Queued block_op 10
    Queuing NoOp 11
    Queued NoOp 11
    Queued block_op 11
    Queuing NoOp 12
    Queued NoOp 12
    Queued block_op 12
    Queuing NoOp 13
    Queued NoOp 13
    Queued block_op 13
    Queuing NoOp 14
    Queued NoOp 14
    Queued block_op 14
    Queuing NoOp 15
    Queued NoOp 15
    Queued block_op 15
    Queuing NoOp 16
    Queued NoOp 16
    Queued block_op 16
    Queuing NoOp 17
    Queued NoOp 17
    Queued block_op 17
    Queuing NoOp 18
    Queued NoOp 18
    Queued block_op 18
    Queuing NoOp 19
    Queued NoOp 19
    Queued block_op 19
    Queuing NoOp 20
    Queued NoOp 20
    Queued block_op 20
    Queuing NoOp 21
    Queued NoOp 21
    Queued block_op 21
    Queuing NoOp 22
    Queued NoOp 22
    Queued block_op 22
    Queuing NoOp 23
    Queued NoOp 23
    Queued block_op 23
    Queuing NoOp 24
    Queued NoOp 24
    Queued block_op 24
    Queuing NoOp 25
    Queued NoOp 25
    Queued block_op 25
    Queuing NoOp 26
    Queued NoOp 26
    Queued block_op 26
    Queuing NoOp 27
    Queued NoOp 27
    Queued block_op 27
    Queuing NoOp 28
    Queued NoOp 28
    Queued block_op 28
    Queuing NoOp 29
    Queued NoOp 29
    Queued block_op 29
    Queuing NoOp 30
    Queued NoOp 30
    Queued block_op 30
    Queuing NoOp 31
    Queued NoOp 31
    Queued block_op 31
    Queuing NoOp 32
    Queued NoOp 32
    Queued block_op 32
    Queuing NoOp 33
    Queued NoOp 33
    Queued block_op 33
    Queuing NoOp 34
    Queued NoOp 34
    Queued block_op 34
    Queuing NoOp 35
    Queued NoOp 35
    Queued block_op 35
    Queuing NoOp 36
    Queued NoOp 36
    Queued block_op 36
    Queuing NoOp 37
    Queued NoOp 37
    Queued block_op 37
    Queuing NoOp 38
    Queued NoOp 38
    Queued block_op 38
    Queuing NoOp 39
    Queued NoOp 39
    Queued block_op 39
    Queuing NoOp 40
    Queued NoOp 40
    Queued block_op 40
    Queuing NoOp 41
    Queued NoOp 41
    Queued block_op 41
    Queuing NoOp 42
    Queued NoOp 42
    Queued block_op 42
    Queuing NoOp 43
    Queued NoOp 43
    Queued block_op 43
    Queuing NoOp 44
    Queued NoOp 44
    Queued block_op 44
    Queuing NoOp 45
    Queued NoOp 45
    Queued block_op 45
    Queuing NoOp 46
    Queued NoOp 46
    Queued block_op 46
    Queuing NoOp 47
    Queued NoOp 47
    Queued block_op 47
    Queuing NoOp 48
    Queued NoOp 48
    Queued block_op 48
    Queuing NoOp 49
    Queued NoOp 49
    Queued block_op 49
    Queuing NoOp 50
    Queued NoOp 50
    Queued block_op 50
    Queuing NoOp 51
    Queued NoOp 51
    Queued block_op 51
    Queuing NoOp 52
    Queued NoOp 52
    Queued block_op 52
    Queuing NoOp 53
    Queued NoOp 53
    Queued block_op 53
    Queuing NoOp 54
    Queued NoOp 54
    Queued block_op 54
    Queuing NoOp 55
    Queued NoOp 55
    Queued block_op 55
    Queuing NoOp 56
    Queued NoOp 56
    Queued block_op 56
    Queuing NoOp 57
    ^C
    $
    
    // test case with a single callback and many kernels
    
    $ cat t2042.cu
    #include <iostream>
    #include <vector>
    #include <mutex>
    #include <condition_variable>
    #include <cstdlib>
    #define CUDACHECK(x) x
    // empty kernel: each launch adds one item to the stream's queue
    __global__ void NoOpKernel() {}
    
    // for blocking stream to wait for host signal: a one-shot latch built on
    // a mutex + condition variable. A stream callback calls Wait(); a host
    // thread calls Signal() to release it.
    class Event {
     private:
      std::mutex mtx_condition_;            // guards `signalled`
      std::condition_variable condition_;   // notified by Signal()
      bool signalled = false;               // latch state; never reset
    
     public:
      // Mark the event signalled and wake all threads blocked in Wait().
      void Signal() {
        {
          std::lock_guard<decltype(mtx_condition_)> lock(mtx_condition_);
          signalled = true;
        }
        condition_.notify_all();
      }
    
      // Block until Signal() has been called (returns immediately if it
      // already has). The while-loop guards against spurious wakeups.
      void Wait() {
        std::unique_lock<decltype(mtx_condition_)> lock(mtx_condition_);
        while (!signalled) {
          condition_.wait(lock);
        }
      }
    };
    
    // CUDA host-function callback: blocks the runtime's callback thread —
    // and therefore the stream — until the Event passed via `arg` is
    // signalled from the host.
    void CUDART_CB block_op_host_fn(void* arg) {
      Event* evt = (Event*)arg;
      evt->Wait();
    }
    
    int main(int argc, char *argv[]) {
      cudaStream_t stream;
      CUDACHECK(cudaStreamCreate(&stream));

      // Unlike the alternating test, only the very first iteration enqueues a
      // blocking callback; every later iteration enqueues just a kernel.
      // Empirically the launch around iteration ~1022 blocks in the runtime.
      int num_loops = 2000; // ~1000 is okay, 2000 will hang
      int num_events = 0;
      std::vector<std::shared_ptr<Event>> event_vec;
      if (argc > 1) num_loops = atoi(argv[1]); // optional command-line override

      for (int i = 0; i < num_loops; i++) {
        std::cout << "Queuing NoOp " << i << std::endl;
        NoOpKernel<<<1, 128, 0, stream>>>(); // HERE : is where it hangs
        std::cout << "Queued NoOp " << i << std::endl;
        if (i == 0) {
          num_events++;
          event_vec.push_back(std::make_shared<Event>());
          // check the launch like every other CUDA call in this demo
          CUDACHECK(cudaLaunchHostFunc(stream, block_op_host_fn, event_vec.back().get()));

          std::cout << "Queued block_op " << i << std::endl;
        }
      }

      // Release the single queued callback so the stream can drain.
      for (int i = 0; i < num_events; i++) {
        event_vec[i]->Signal();
      }

      // clean up
      CUDACHECK(cudaDeviceSynchronize());
      CUDACHECK(cudaStreamDestroy(stream));
      return 0;
    }
    $ nvcc -o t2042 t2042.cu
    $ ./t2042
    ... <snip>
    Queuing NoOp 1019
    Queued NoOp 1019
    Queuing NoOp 1020
    Queued NoOp 1020
    Queuing NoOp 1021
    Queued NoOp 1021
    Queuing NoOp 1022
    ^C
    $
    

    (the code hangs when the queue becomes "full", and I terminate at that point with ctrl-C)

    How can I determine this capacity other than enqueuing more and more stuff until I can no longer fit any?

    Currently, there is no specification for this in CUDA, nor any explicit method to query for this at runtime.