#include <cuda_runtime.h>

#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <random>
#include <string>
using namespace std;
// Host-side stopwatch used to time each phase of the benchmark.
// Uses steady_clock rather than system_clock: system_clock is not
// monotonic (NTP/wall-clock adjustments can move it), which can corrupt
// or even negate measured intervals. Interface is unchanged.
class MyTimer {
    std::chrono::time_point<std::chrono::steady_clock> start;
public:
    // Mark the beginning of a timed interval.
    void startCounter() {
        start = std::chrono::steady_clock::now();
    }
    // Whole nanoseconds elapsed since the last startCounter() call.
    int64_t getCounterNs() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
    }
    // Whole milliseconds elapsed since the last startCounter() call.
    int64_t getCounterMs() {
        return std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - start).count();
    }
    // Fractional milliseconds elapsed since the last startCounter() call.
    double getCounterMsPrecise() {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count()
                / 1000000.0;
    }
};
// Trivial kernel launched once before the timed sections so that one-time
// CUDA context creation cost is not charged to any of the timers.
__global__ void HelloWorld() {
    printf("Hello world\n");
}
volatile double dummy = 0;
// Element-wise product: output[i] = x[i] * y[i] for every i in [0, N).
// Grid-stride loop, so any <<<grid, block>>> configuration covers all N
// elements; __restrict__ tells the compiler the arrays do not alias.
__global__
void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
    const int step = blockDim.x * gridDim.x;
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += step) {
        output[idx] = x[idx] * y[idx];
    }
}
// Abort with a diagnostic if a CUDA runtime call fails; without this,
// an early error makes every later timing/result silently meaningless.
#define CUDA_CHECK(call)                                                 \
    do {                                                                 \
        cudaError_t err_ = (call);                                       \
        if (err_ != cudaSuccess) {                                       \
            cout << "CUDA error " << __FILE__ << ":" << __LINE__ << ": " \
                 << cudaGetErrorString(err_) << "\n";                    \
            exit(1);                                                     \
        }                                                                \
    } while (0)

// Times each phase (host init, device alloc, H2D copy, kernel, D2H copy)
// of an element-wise multiply and validates the GPU output against a
// host-computed reference. Returns 0 on success, 1 on mismatch.
int main()
{
    MyTimer timer;
    srand(time(NULL));

    // Warm-up launch: absorbs CUDA context creation before timing starts.
    HelloWorld<<<1,1>>>();
    CUDA_CHECK(cudaGetLastError());

    timer.startCounter();
    const int N = 2000 * 2000;
    float* h_a = new float[N];
    float* h_b = new float[N];
    float* h_c = new float[N];
    float* h_res = new float[N];
    for (int i = 0; i < N; i++) {
        h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
        h_c[i] = h_a[i] * h_b[i];   // host reference result
    }
    dummy = timer.getCounterMsPrecise();

    timer.startCounter();
    float *d_a, *d_b, *d_c;
    CUDA_CHECK(cudaMalloc(&d_a, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_b, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc(&d_c, N * sizeof(float)));
    dummy = timer.getCounterMsPrecise();
    cout << "cudaMalloc cost = " << dummy << "\n";

    timer.startCounter();
    CUDA_CHECK(cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice));
    // Blocking cudaMemcpy already synchronizes; no extra sync needed here.
    dummy = timer.getCounterMsPrecise();
    cout << "H2D copy cost = " << dummy << "\n";

    timer.startCounter();
    constexpr int GRID_DIM = 256;
    constexpr int BLOCK_DIM = 256;
    multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
    CUDA_CHECK(cudaGetLastError());       // catches launch-config errors
    CUDA_CHECK(cudaDeviceSynchronize());  // catches in-kernel faults
    dummy = timer.getCounterMsPrecise();
    cout << "kernel cost = " << dummy << "\n";

    timer.startCounter();
    CUDA_CHECK(cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost));
    dummy = timer.getCounterMsPrecise();
    // Print the stored reading; the original printed a *second* timer read,
    // so the reported D2H cost differed from the value saved in `dummy`.
    cout << "D2H copy cost = " << dummy << "\n";

    // Compare with a small relative tolerance. A lone multiply should match
    // bit-for-bit, but host/device rounding environments can differ
    // (e.g. compiler flags) — TODO confirm exact equality holds if desired.
    bool mismatch = false;
    for (int i = 0; i < N; i++) {
        if (fabsf(h_res[i] - h_c[i]) > 1e-5f * fabsf(h_c[i])) {
            mismatch = true;
            break;
        }
    }

    // Release all host and device buffers (the original leaked all seven).
    delete[] h_a;
    delete[] h_b;
    delete[] h_c;
    delete[] h_res;
    CUDA_CHECK(cudaFree(d_a));
    CUDA_CHECK(cudaFree(d_b));
    CUDA_CHECK(cudaFree(d_c));

    if (mismatch) {
        cout << "error\n";
        return 1;
    }
    return 0;
}
If I use normal `cudaMalloc`, the result is
Hello world
cudaMalloc cost = 0.599463
H2D copy cost = 5.16785
kernel cost = 0.109068
D2H copy cost = 7.18768
but if I use `cudaMallocManaged`, it becomes
Hello world
cudaMalloc cost = 0.116722
H2D copy cost = 8.26673
kernel cost = 1.70356
D2H copy cost = 6.8841
Why is there such a big performance drop? The code has manually copied the memory to the device side, so shouldn't it behave exactly the same as regular cudaMalloc-ed device memory?
When using managed memory, "prefetching" does not mean use of `cudaMemcpy`. I don't recommend using `cudaMemcpy` with managed memory: you won't find any training materials that suggest it, and furthermore it will not necessarily do what you think.
To prefetch data in a demand-paged managed memory (also called unified memory, or UM) regime, you should actually use `cudaMemPrefetchAsync`. When I do that, I observe no significant difference in performance between the two cases. For a sensible comparison, I had to refactor your code somewhat:
$ cat t2230.cu
#include <cuda_runtime.h>
#include <string>
#include <chrono>
#include <random>
#include <iostream>
using namespace std;
// Host-side stopwatch for timing the phases of the benchmark.
class MyTimer {
    std::chrono::time_point<std::chrono::system_clock> start;
public:
    // Mark the beginning of a timed interval.
    void startCounter() {
        start = std::chrono::system_clock::now();
    }
    // Whole nanoseconds elapsed since the last startCounter() call.
    int64_t getCounterNs() {
        auto elapsed = std::chrono::system_clock::now() - start;
        return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count();
    }
    // Whole milliseconds elapsed since the last startCounter() call.
    int64_t getCounterMs() {
        auto elapsed = std::chrono::system_clock::now() - start;
        return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count();
    }
    // Fractional milliseconds elapsed since the last startCounter() call.
    double getCounterMsPrecise() {
        auto elapsed = std::chrono::system_clock::now() - start;
        return std::chrono::duration_cast<std::chrono::nanoseconds>(elapsed).count() / 1000000.0;
    }
};
// Warm-up kernel: its only purpose is to force CUDA context
// initialization before the timed sections run.
__global__ void HelloWorld()
{
    printf("Hello world\n");
}
volatile double dummy = 0;
// Element-wise multiply: output[i] = x[i] * y[i] for all i in [0, N).
// Grid-stride loop, so any <<<grid, block>>> configuration covers every
// element regardless of N; __restrict__ asserts the arrays do not alias.
__global__
void multiply(int N, float* __restrict__ output, const float* __restrict__ x, const float* __restrict__ y)
{
int start = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
for (int i = start; i < N; i += stride) {
output[i] = x[i] * y[i];
}
}
// Benchmark driver: compares cudaMalloc + explicit cudaMemcpy (default
// build) against cudaMallocManaged + cudaMemPrefetchAsync (-DUSE_MANAGED).
// The point being demonstrated: with prefetch, managed memory performs
// on par with explicitly copied device memory.
int main()
{
MyTimer timer;
srand(time(NULL));
// Warm-up launch: absorbs CUDA context creation cost before timing.
HelloWorld<<<1,1>>>();
int N = 2000 * 2000;
timer.startCounter();
float *d_a, *d_b, *d_c;
#ifdef USE_MANAGED
// Managed path: allocations are host-addressable, so they are
// initialized in place with no separate host buffers.
cudaMallocManaged(&d_a, N * sizeof(float));
cudaMallocManaged(&d_b, N * sizeof(float));
cudaMallocManaged(&d_c, N * sizeof(float));
for (int i = 0; i < N; i++) {
d_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
d_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
d_c[i] = 0.f;
}
// Prefetch to device 0 so the kernel does not pay demand-paging faults.
// NOTE(review): device id 0 is hardcoded — assumes a single visible GPU
// (see the CUDA_VISIBLE_DEVICES usage in the run transcript).
cudaMemPrefetchAsync(d_a, N*sizeof(float), 0);
cudaMemPrefetchAsync(d_b, N*sizeof(float), 0);
cudaMemPrefetchAsync(d_c, N*sizeof(float), 0);
#else
// Explicit path: init in host buffers, then copy to device memory.
float* h_a = new float[N];
float* h_b = new float[N];
float* h_res = new float[N];
for (int i = 0; i < N; i++) {
h_a[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
h_b[i] = float(rand() % 1000000) / (rand() % 1000 + 1);
}
cudaMalloc(&d_a, N * sizeof(float));
cudaMalloc(&d_b, N * sizeof(float));
cudaMalloc(&d_c, N * sizeof(float));
cudaMemcpy(d_a, h_a, N * sizeof(float), cudaMemcpyHostToDevice);
cudaMemcpy(d_b, h_b, N * sizeof(float), cudaMemcpyHostToDevice);
#endif
// Wait for the async prefetches (or copies) before stopping the timer,
// so both paths are measured to the same point of completion.
cudaDeviceSynchronize();
dummy = timer.getCounterMsPrecise();
cout << "alloc/H2D cost = " << dummy << "\n";
constexpr int GRID_DIM = 80;
constexpr int BLOCK_DIM = 1024;
timer.startCounter();
multiply<<<GRID_DIM, BLOCK_DIM>>>(N, d_c, d_a, d_b);
cudaDeviceSynchronize();
dummy = timer.getCounterMsPrecise();
cout << "kernel cost = " << dummy << "\n";
// In the managed build, results are validated directly through the
// managed pointers; the explicit build redirects these to host copies.
float *res = d_c;
float *a = d_a;
float *b = d_b;
#ifndef USE_MANAGED
timer.startCounter();
cudaMemcpy(h_res, d_c, N * sizeof(float), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
dummy = timer.getCounterMsPrecise();
cout << "D2H copy cost = " << timer.getCounterMsPrecise() << "\n";
res = h_res;
a = h_a;
b = h_b;
#endif
// NOTE(review): exact float equality — relies on the device product
// matching the host product bit-for-bit; a tolerance would be safer.
for (int i = 0; i < N; i++) if (res[i] != (a[i]*b[i])) {
cout << "error\n";
exit(1);
}
return 0;
}
$ nvcc -o t2230 t2230.cu
$ CUDA_VISIBLE_DEVICES="0" ./t2230
Hello world
alloc/H2D cost = 453.012
kernel cost = 0.109507
D2H copy cost = 8.04054
$ nvcc -o t2230 t2230.cu -DUSE_MANAGED
$ CUDA_VISIBLE_DEVICES="0" ./t2230
Hello world
alloc/H2D cost = 411.502
kernel cost = 0.101654
$
(V100, CUDA 11.4)
Note that this assumes you are in a demand-paged UM regime. If you are not in a demand-paged regime (e.g. on a Maxwell or Kepler device, or on Windows, or on Jetson, currently), then you would not use `cudaMemPrefetchAsync`, and the data migration is inextricably linked to the kernel launch. Also make note of the use of `CUDA_VISIBLE_DEVICES`: in a multi-GPU system, UM can have a variety of different behaviors depending on system topology as well as the GPUs in the system, which can make an apples-to-apples comparison difficult.
At the end, I did not prefetch the data back to the host; if you want to compare that activity as well, you've already been given the necessary instruction.