Search code examples
Tags: c++, optimization, cuda, malloc, gpu

CUDA kernel 10x slower when operating on cudaMallocManaged memory even when prefetched


#include <cuda_runtime.h>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
using namespace std;

// Simple host-side stopwatch for coarse benchmarking.
// Uses std::chrono::steady_clock (monotonic) instead of system_clock:
// the wall clock may be adjusted (NTP, DST) while an interval is being
// measured, which would corrupt the reported durations.
class MyTimer {
    std::chrono::time_point<std::chrono::steady_clock> start;

public:
    // Begin (or restart) the measured interval.
    void startCounter() {
        start = std::chrono::steady_clock::now();
    }

    // Whole nanoseconds elapsed since startCounter().
    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Whole milliseconds elapsed since startCounter().
    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Milliseconds elapsed with nanosecond resolution.
    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count()
                / 1000000.0;
    }
};

// Trivial kernel launched once before timing begins so that one-time CUDA
// context creation / initialization cost is not charged to the timed sections.
__global__
void HelloWorld()
{
  printf("Hello world\n");
}

// Volatile sink for timer readings so the compiler cannot optimize away
// the timing reads themselves.
volatile double dummy = 0;

// Elementwise product: output[i] = x[i] * y[i] for all i in [0, N).
// Uses a grid-stride loop (stride = total threads in the grid), so any
// <<<grid, block>>> configuration covers all N elements.
// __restrict__ promises the three buffers do not alias, allowing the
// compiler to cache the read-only inputs.
__global__
void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
  int start = blockIdx.x * blockDim.x + threadIdx.x;  // this thread's first index
  int stride = blockDim.x * gridDim.x;                // total threads launched

  for (int i = start; i < N; i += stride) {
    output[i] = x[i] * y[i];
  }
}


// Benchmarks the cost of cudaMalloc, H2D copy, kernel execution, and D2H
// copy for an elementwise multiply of N floats, then validates the GPU
// result against a CPU reference.
int main()
{
  MyTimer timer;
  srand(time(NULL));
  // Warm-up launch: absorbs CUDA context creation so it is not timed below.
  HelloWorld<<<1,1>>>();

  // ---- Host setup: fill inputs with pseudo-random floats, compute reference.
  timer.startCounter();
  int N = 2000 * 2000;
  float* h_a = new float[N];
  float* h_b = new float[N];
  float* h_c = new float[N];    // CPU reference result
  float* h_res = new float[N];  // result copied back from the GPU
  for (int i = 0; i < N; i++) {
    h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_c[i] = h_a[i] * h_b[i];
  }
  dummy = timer.getCounterMsPrecise();

  // ---- Device allocation.
  timer.startCounter();
  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, N * sizeof(float));
  cudaMalloc(&d_b, N * sizeof(float));
  cudaMalloc(&d_c, N * sizeof(float));
  dummy = timer.getCounterMsPrecise();
  cout << "cudaMalloc cost = " << dummy << "\n";

  // ---- Host -> device copies (synchronous cudaMemcpy).
  timer.startCounter();
  cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "H2D copy cost = " << dummy << "\n";

  // ---- Kernel execution.
  timer.startCounter();
  constexpr int GRID_DIM = 256;
  constexpr int BLOCK_DIM = 256;
  multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
  cudaError_t err = cudaGetLastError();  // launches do not return errors directly
  if (err != cudaSuccess) {
    cout << "kernel launch failed: " << cudaGetErrorString(err) << "\n";
    exit(1);
  }
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "kernel cost = " << dummy << "\n";

  // ---- Device -> host copy of the result.
  timer.startCounter();
  cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  // Fix: print the stored reading; the original read the timer a second
  // time here, printing a slightly later value than the one measured.
  cout << "D2H copy cost = " << dummy << "\n";

  // NOTE(review): exact float comparison assumes host and device round the
  // single multiply identically; a tolerance check would be more robust.
  for (int i = 0; i < N; i++) if (h_res[i] != h_c[i]) {
    cout << "error\n";
    exit(1);
  }

  // Release device and host memory (the original leaked all seven buffers).
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete[] h_a;
  delete[] h_b;
  delete[] h_c;
  delete[] h_res;

  return 0;
}

If I use normal cudaMalloc, the result is

Hello world
cudaMalloc cost = 0.599463
H2D copy cost = 5.16785
kernel cost = 0.109068
D2H copy cost = 7.18768

but if I use cudaMallocManaged, it becomes

Hello world
cudaMalloc cost = 0.116722
H2D copy cost = 8.26673
kernel cost = 1.70356
D2H copy cost = 6.8841

Why is there such a big performance drop? The code has manually copied the memory to device side, so shouldn't it be exactly the same as regular cudaMalloc-ed device memory?


Solution

  • When using managed memory, "prefetching" does not mean use of cudaMemcpy. I don't recommend use of cudaMemcpy with managed memory. You won't find any training materials that suggest that, and furthermore it will not necessarily do what you think.

    To prefetch data in a demand-paged managed memory (also called unified memory, or UM) regime, you should actually use cudaMemPrefetchAsync. When I do that, I observe no significant difference in performance between the two cases. For a sensible comparison, I had to refactor your code somewhat:

    $ cat t2230.cu
    #include <cuda_runtime.h>
    #include <string>
    #include <chrono>
    #include <random>
    #include <iostream>
    using namespace std;
    
    // Host-side stopwatch: startCounter() marks a reference point and the
    // getCounter* accessors report the time elapsed since that mark.
    class MyTimer {
        using Clock = std::chrono::system_clock;
        Clock::time_point start;

    public:
        // Begin (or restart) the measured interval.
        void startCounter() {
            start = Clock::now();
        }

        // Whole nanoseconds elapsed since startCounter().
        int64_t getCounterNs() {
            auto elapsed = Clock::now() - start;
            return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
        }

        // Whole milliseconds elapsed since startCounter().
        int64_t getCounterMs() {
            auto elapsed = Clock::now() - start;
            return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
        }

        // Milliseconds elapsed, carrying nanosecond resolution.
        double getCounterMsPrecise() {
            return getCounterNs() / 1000000.0;
        }
    };
    
    // Trivial warm-up kernel: launched once before timing so one-time CUDA
    // context initialization is not charged to the timed sections.
    __global__
    void HelloWorld()
    {
      printf("Hello world\n");
    }
    
    // Volatile sink so the compiler cannot elide the timer reads.
    volatile double dummy = 0;
    
    // Elementwise product: output[i] = x[i] * y[i] for i in [0, N).
    // Grid-stride loop, so any launch configuration covers all N elements;
    // __restrict__ asserts the buffers do not alias.
    __global__
    void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
    {
      int start = blockIdx.x * blockDim.x + threadIdx.x;  // first index for this thread
      int stride = blockDim.x * gridDim.x;                // total threads in grid
    
      for (int i = start; i < N; i += stride) {
        output[i] = x[i] * y[i];
      }
    }
    
    
    // Compares two data-staging strategies for the same multiply benchmark:
    //  - default build: cudaMalloc + explicit cudaMemcpy H2D
    //  - -DUSE_MANAGED: cudaMallocManaged + cudaMemPrefetchAsync (the correct
    //    way to pre-stage managed/unified memory onto the device)
    int main()
    {
      MyTimer timer;
      srand(time(NULL));
      // Warm-up launch so context creation is not charged to the timings.
      HelloWorld<<<1,1>>>();
      int N = 2000 * 2000;
      timer.startCounter();
      float *d_a, *d_b, *d_c;
    #ifdef USE_MANAGED
      // Managed-memory path: initialize directly through the managed pointers
      // on the host, then prefetch all three allocations to device 0.
      cudaMallocManaged(&d_a, N * sizeof(float));
      cudaMallocManaged(&d_b, N * sizeof(float));
      cudaMallocManaged(&d_c, N * sizeof(float));
      for (int i = 0; i < N; i++) {
        d_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        d_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        d_c[i] = 0.f;
      }
      cudaMemPrefetchAsync(d_a, N*sizeof(float), 0);  // migrate pages to device 0
      cudaMemPrefetchAsync(d_b, N*sizeof(float), 0);
      cudaMemPrefetchAsync(d_c, N*sizeof(float), 0);
    #else
      // Ordinary device-memory path: host staging buffers + explicit copies.
      float* h_a = new float[N];
      float* h_b = new float[N];
      float* h_res = new float[N];
      for (int i = 0; i < N; i++) {
        h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
      }
      cudaMalloc(&d_a, N * sizeof(float));
      cudaMalloc(&d_b, N * sizeof(float));
      cudaMalloc(&d_c, N * sizeof(float));
      cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
    #endif
      // One combined timing for allocation + initialization + staging, so the
      // two paths are compared on equal terms.
      cudaDeviceSynchronize();
      dummy = timer.getCounterMsPrecise();
      cout << "alloc/H2D cost = " << dummy << "\n";
      constexpr int GRID_DIM = 80;
      constexpr int BLOCK_DIM = 1024;
    
      timer.startCounter();
      multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
      cudaDeviceSynchronize();
      dummy = timer.getCounterMsPrecise();
      cout << "kernel cost = " << dummy << "\n";
      // In the managed case the result is directly readable on the host
      // (demand paging migrates it back); otherwise copy it out explicitly.
      float *res = d_c;
      float *a = d_a;
      float *b = d_b;
    #ifndef USE_MANAGED
      timer.startCounter();
      cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
      cudaDeviceSynchronize();
      dummy = timer.getCounterMsPrecise();
      cout << "D2H copy cost = " << timer.getCounterMsPrecise() << "\n";
      res = h_res;
      a = h_a;
      b = h_b;
    #endif
    
      // Validate against the product recomputed on the host.
      for (int i = 0; i < N; i++) if (res[i] != (a[i]*b[i])) {
        cout << "error\n";
        exit(1);
      }
      return 0;
    }
    $ nvcc -o t2230 t2230.cu
    $ CUDA_VISIBLE_DEVICES="0" ./t2230
    Hello world
    alloc/H2D cost = 453.012
    kernel cost = 0.109507
    D2H copy cost = 8.04054
    $ nvcc -o t2230 t2230.cu -DUSE_MANAGED
    $ CUDA_VISIBLE_DEVICES="0" ./t2230
    Hello world
    alloc/H2D cost = 411.502
    kernel cost = 0.101654
    $
    

    (V100, CUDA 11.4)

    Note that this assumes you are in a demand-paged UM regime. If you are not in a demand-paged regime (e.g. on a Maxwell or Kepler device, or on windows, or on Jetson, currently), then you would not use cudaMemPrefetchAsync, and the data migration is inextricably linked to the kernel launch. Also make note of the use of CUDA_VISIBLE_DEVICES. In a multi-GPU system, UM can have a variety of different behaviors depending on system topology as well as the GPUs in the system. This can make an apples-to-apples comparison difficult.

    At the end, I did not do prefetching of the data back to the host. If you want to compare that activity as well, you've already been given the necessary instruction above.