Memory allocation is not permitted when running kernel with cudaLaunchCooperativeKernel and -rdc=true

Running the following code produces the error "operation not permitted". Is there anything I am missing? I'm running it with compute capability 7.5 and building it with the command `nvcc test.cu -rdc=true`. It works without RDC.

#include <cooperative_groups.h>

#include <cstdio>
#include <iostream>

// Minimal reproducer kernel: each thread that runs it issues one device-side
// cudaMalloc request for sizeof(int) bytes. Both the returned status and the
// resulting pointer are discarded, and the allocation is never freed.
__global__ void kernel() {
  void* devPtr;
  (void)cudaMalloc(&devPtr, sizeof(int));
}

// Prints the runtime's description of `status` when it is not cudaSuccess.
// Returns true when the call succeeded, false otherwise.
static bool cudaCallSucceeded(cudaError_t status) {
  if (status == cudaSuccess) {
    return true;
  }
  printf("CUDA runtime error %s\n", cudaGetErrorString(status));
  return false;
}

// Launches `kernel` cooperatively on a grid of two single-thread blocks.
//
// Every CUDA runtime call is checked: the device-attribute query, the
// cooperative launch itself, and the final device synchronization.
//
// Returns 0 on success, or when device 0 reports no cooperative-launch
// support (a message is printed in that case); returns 1 when any CUDA
// runtime call fails.
int main() {
  int dev = 0;
  int supportsCoopLaunch = 0;
  if (!cudaCallSucceeded(cudaDeviceGetAttribute(
          &supportsCoopLaunch, cudaDevAttrCooperativeLaunch, dev))) {
    return 1;
  }
  if (supportsCoopLaunch == 0) {
    std::cout << "Device does not support cooperative launch, required to synchronize globally on the grid." << std::endl;
    return 0;
  }

  // `kernel` takes no parameters, so no entry of this array should be read.
  // A one-element placeholder is used because a zero-length array
  // (`void* args[] = {}`) is a compiler extension, not standard C++.
  void* args[] = {nullptr};
  dim3 dimBlock(1, 1, 1);
  dim3 dimGrid(2, 1, 1);
  if (!cudaCallSucceeded(cudaLaunchCooperativeKernel(
          (void*)kernel, dimGrid, dimBlock, args))) {
    return 1;
  }

  // The launch is asynchronous; check the synchronization result as well so
  // that a failure reported at this point is not silently dropped.
  if (!cudaCallSucceeded(cudaDeviceSynchronize())) {
    return 1;
  }
  return 0;
}

Solution

Use `malloc` instead of `cudaMalloc` inside the kernel. The error is probably due to a bug (see the comments by Robert Crovella above).