Search code examples
Tags: c++, optimization, cuda, malloc, gpu

CUDA kernel 10x slower when operating on cudaMallocManaged memory even when prefetched


#include <cuda_runtime.h>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
using namespace std;

// Simple host-side stopwatch for coarse benchmarking.
// Uses std::chrono::steady_clock (monotonic) instead of system_clock:
// the wall clock may be adjusted (NTP, DST) while an interval is being
// measured, which would corrupt the reported durations.
class MyTimer {
    std::chrono::time_point<std::chrono::steady_clock> start;

public:
    // Begin (or restart) the measured interval.
    void startCounter() {
        start = std::chrono::steady_clock::now();
    }

    // Whole nanoseconds elapsed since startCounter().
    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Whole milliseconds elapsed since startCounter().
    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
    }

    // Milliseconds elapsed with nanosecond resolution.
    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count()
                / 1000000.0;
    }
};

// Trivial kernel launched once before timing begins so that one-time CUDA
// context creation / initialization cost is not charged to the timed sections.
__global__
void HelloWorld()
{
  printf("Hello world\n");
}

// Volatile sink for timer readings so the compiler cannot optimize away
// the timing reads themselves.
volatile double dummy = 0;

// Elementwise product: output[i] = x[i] * y[i] for all i in [0, N).
// Uses a grid-stride loop (stride = total threads in the grid), so any
// <<<grid, block>>> configuration covers all N elements.
// __restrict__ promises the three buffers do not alias, allowing the
// compiler to cache the read-only inputs.
__global__
void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
  int start = blockIdx.x * blockDim.x + threadIdx.x;  // this thread's first index
  int stride = blockDim.x * gridDim.x;                // total threads launched

  for (int i = start; i < N; i += stride) {
    output[i] = x[i] * y[i];
  }
}


// Benchmarks the cost of cudaMalloc, H2D copy, kernel execution, and D2H
// copy for an elementwise multiply of N floats, then validates the GPU
// result against a CPU reference.
int main()
{
  MyTimer timer;
  srand(time(NULL));
  // Warm-up launch: absorbs CUDA context creation so it is not timed below.
  HelloWorld<<<1,1>>>();

  // ---- Host setup: fill inputs with pseudo-random floats, compute reference.
  timer.startCounter();
  int N = 2000 * 2000;
  float* h_a = new float[N];
  float* h_b = new float[N];
  float* h_c = new float[N];    // CPU reference result
  float* h_res = new float[N];  // result copied back from the GPU
  for (int i = 0; i < N; i++) {
    h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
    h_c[i] = h_a[i] * h_b[i];
  }
  dummy = timer.getCounterMsPrecise();

  // ---- Device allocation.
  timer.startCounter();
  float *d_a, *d_b, *d_c;
  cudaMalloc(&d_a, N * sizeof(float));
  cudaMalloc(&d_b, N * sizeof(float));
  cudaMalloc(&d_c, N * sizeof(float));
  dummy = timer.getCounterMsPrecise();
  cout << "cudaMalloc cost = " << dummy << "\n";

  // ---- Host -> device copies (synchronous cudaMemcpy).
  timer.startCounter();
  cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "H2D copy cost = " << dummy << "\n";

  // ---- Kernel execution.
  timer.startCounter();
  constexpr int GRID_DIM = 256;
  constexpr int BLOCK_DIM = 256;
  multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
  cudaError_t err = cudaGetLastError();  // launches do not return errors directly
  if (err != cudaSuccess) {
    cout << "kernel launch failed: " << cudaGetErrorString(err) << "\n";
    exit(1);
  }
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  cout << "kernel cost = " << dummy << "\n";

  // ---- Device -> host copy of the result.
  timer.startCounter();
  cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
  cudaDeviceSynchronize();
  dummy = timer.getCounterMsPrecise();
  // Fix: print the stored reading; the original read the timer a second
  // time here, printing a slightly later value than the one measured.
  cout << "D2H copy cost = " << dummy << "\n";

  // NOTE(review): exact float comparison assumes host and device round the
  // single multiply identically; a tolerance check would be more robust.
  for (int i = 0; i < N; i++) if (h_res[i] != h_c[i]) {
    cout << "error\n";
    exit(1);
  }

  // Release device and host memory (the original leaked all seven buffers).
  cudaFree(d_a);
  cudaFree(d_b);
  cudaFree(d_c);
  delete[] h_a;
  delete[] h_b;
  delete[] h_c;
  delete[] h_res;

  return 0;
}

If I use normal cudaMalloc, the result is

Hello world
cudaMalloc cost = 0.599463
H2D copy cost = 5.16785
kernel cost = 0.109068
D2H copy cost = 7.18768

but if I use cudaMallocManaged, it becomes

Hello world
cudaMalloc cost = 0.116722
H2D copy cost = 8.26673
kernel cost = 1.70356
D2H copy cost = 6.8841

Why is there such a big performance drop? The code has manually copied the memory to device side, so shouldn't it be exactly the same as regular cudaMalloc-ed device memory?


Solution

  • When using managed memory, "prefetching" does not mean use of cudaMemcpy. I don't recommend use of cudaMemcpy with managed memory. You won't find any training materials that suggest that, and furthermore it will not necessarily do what you think.

    To prefetch data in a demand-paged managed memory (also called unified memory, or UM) regime, you should actually use cudaMemPrefetchAsync. When I do that, I observe no significant difference in performance between the two cases. For a sensible comparison, I had to refactor your code somewhat:

    $ cat t2230.cu
    #include <cuda_runtime.h>
    #include <string>
    #include <chrono>
    #include <random>
    #include <iostream>
    using namespace std;
    
    // Host-side stopwatch: startCounter() marks a reference point and the
    // getCounter* accessors report the time elapsed since that mark.
    class MyTimer {
        using Clock = std::chrono::system_clock;
        Clock::time_point start;

    public:
        // Begin (or restart) the measured interval.
        void startCounter() {
            start = Clock::now();
        }

        // Whole nanoseconds elapsed since startCounter().
        int64_t getCounterNs() {
            auto elapsed = Clock::now() - start;
            return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
        }

        // Whole milliseconds elapsed since startCounter().
        int64_t getCounterMs() {
            auto elapsed = Clock::now() - start;
            return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
        }

        // Milliseconds elapsed, carrying nanosecond resolution.
        double getCounterMsPrecise() {
            return getCounterNs() / 1000000.0;
        }
    };
    
    // Trivial warm-up kernel: launched once before timing so one-time CUDA
    // context initialization is not charged to the timed sections.
    __global__
    void HelloWorld()
    {
      printf("Hello world\n");
    }
    
    // Volatile sink so the compiler cannot elide the timer reads.
    volatile double dummy = 0;
    
    // Elementwise product: output[i] = x[i] * y[i] for i in [0, N).
    // Grid-stride loop, so any launch configuration covers all N elements;
    // __restrict__ asserts the buffers do not alias.
    __global__
    void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
    {
      int start = blockIdx.x * blockDim.x + threadIdx.x;  // first index for this thread
      int stride = blockDim.x * gridDim.x;                // total threads in grid
    
      for (int i = start; i < N; i += stride) {
        output[i] = x[i] * y[i];
      }
    }
    
    
    // Compares two data-staging strategies for the same multiply benchmark:
    //  - default build: cudaMalloc + explicit cudaMemcpy H2D
    //  - -DUSE_MANAGED: cudaMallocManaged + cudaMemPrefetchAsync (the correct
    //    way to pre-stage managed/unified memory onto the device)
    int main()
    {
      MyTimer timer;
      srand(time(NULL));
      // Warm-up launch so context creation is not charged to the timings.
      HelloWorld<<<1,1>>>();
      int N = 2000 * 2000;
      timer.startCounter();
      float *d_a, *d_b, *d_c;
    #ifdef USE_MANAGED
      // Managed-memory path: initialize directly through the managed pointers
      // on the host, then prefetch all three allocations to device 0.
      cudaMallocManaged(&d_a, N * sizeof(float));
      cudaMallocManaged(&d_b, N * sizeof(float));
      cudaMallocManaged(&d_c, N * sizeof(float));
      for (int i = 0; i < N; i++) {
        d_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        d_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        d_c[i] = 0.f;
      }
      cudaMemPrefetchAsync(d_a, N*sizeof(float), 0);  // migrate pages to device 0
      cudaMemPrefetchAsync(d_b, N*sizeof(float), 0);
      cudaMemPrefetchAsync(d_c, N*sizeof(float), 0);
    #else
      // Ordinary device-memory path: host staging buffers + explicit copies.
      float* h_a = new float[N];
      float* h_b = new float[N];
      float* h_res = new float[N];
      for (int i = 0; i < N; i++) {
        h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
      }
      cudaMalloc(&d_a, N * sizeof(float));
      cudaMalloc(&d_b, N * sizeof(float));
      cudaMalloc(&d_c, N * sizeof(float));
      cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
      cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
    #endif
      // One combined timing for allocation + initialization + staging, so the
      // two paths are compared on equal terms.
      cudaDeviceSynchronize();
      dummy = timer.getCounterMsPrecise();
      cout << "alloc/H2D cost = " << dummy << "\n";
      constexpr int GRID_DIM = 80;
      constexpr int BLOCK_DIM = 1024;
    
      timer.startCounter();
      multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
      cudaDeviceSynchronize();
      dummy = timer.getCounterMsPrecise();
      cout << "kernel cost = " << dummy << "\n";
      // In the managed case the result is directly readable on the host
      // (demand paging migrates it back); otherwise copy it out explicitly.
      float *res = d_c;
      float *a = d_a;
      float *b = d_b;
    #ifndef USE_MANAGED
      timer.startCounter();
      cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
      cudaDeviceSynchronize();
      dummy = timer.getCounterMsPrecise();
      cout << "D2H copy cost = " << timer.getCounterMsPrecise() << "\n";
      res = h_res;
      a = h_a;
      b = h_b;
    #endif
    
      // Validate against the product recomputed on the host.
      for (int i = 0; i < N; i++) if (res[i] != (a[i]*b[i])) {
        cout << "error\n";
        exit(1);
      }
      return 0;
    }
    $ nvcc -o t2230 t2230.cu
    $ CUDA_VISIBLE_DEVICES="0" ./t2230
    Hello world
    alloc/H2D cost = 453.012
    kernel cost = 0.109507
    D2H copy cost = 8.04054
    $ nvcc -o t2230 t2230.cu -DUSE_MANAGED
    $ CUDA_VISIBLE_DEVICES="0" ./t2230
    Hello world
    alloc/H2D cost = 411.502
    kernel cost = 0.101654
    $
    

    (V100, CUDA 11.4)

    Note that this assumes you are in a demand-paged UM regime. If you are not in a demand-paged regime (e.g. on a Maxwell or Kepler device, or on windows, or on Jetson, currently), then you would not use cudaMemPrefetchAsync, and the data migration is inextricably linked to the kernel launch. Also make note of the use of CUDA_VISIBLE_DEVICES. In a multi-GPU system, UM can have a variety of different behaviors depending on system topology as well as the GPUs in the system. This can make an apples-to-apples comparison difficult.

    At the end, I did not do prefetching of the data back to the host. If you want to compare that activity as well, you've already been given the necessary instruction above.