Tags: c++, cuda, hpc, multi-gpu

CUDA with multiple GPUs: host and device report different device numbers


I am trying to utilize multiple GPUs on an HPC cluster for a simulation, but during testing I found that everything appears to run on a single GPU (the device ID reported from the kernel is always 0), even though the host reports a different device for each thread.

Below is a minimal working example that illustrates the problem I have with my actual simulation.

#include <cuda_runtime.h>
#include <cstdio>   // printf (host and device)
#include <iostream>
#include <omp.h>

// Kernel to print the device ID from the GPU
__global__ void printGPUDeviceID() {
    int deviceID;
    cudaGetDevice(&deviceID);  // Get the current device ID
    printf("Device ID from the kernel: %d\n", deviceID);
}

int main() {
    // Get the number of available devices
    int num_devices;
    cudaGetDeviceCount(&num_devices);
    if (num_devices < 2) {
        std::cout << "This example requires at least two GPUs." << std::endl;
        return 1;
    }

    // Use OpenMP to create threads for each GPU
    #pragma omp parallel num_threads(num_devices)
    {
        int thread_id = omp_get_thread_num();  // Get the OpenMP thread ID
        int device_id = thread_id;             // Assign one device per thread

        // Set the current device for this thread
        cudaSetDevice(device_id);

        // Get and print the device ID from the host
        int deviceIDFromHost;
        cudaGetDevice(&deviceIDFromHost);
        printf("Device ID from the host (thread %d): %d\n", thread_id, deviceIDFromHost);

        // Launch a kernel to print the device ID from the GPU
        printGPUDeviceID<<<1, 1>>>();

        // Wait for the GPU to finish
        cudaDeviceSynchronize();

        // Check for any errors during kernel execution
        cudaError_t err = cudaGetLastError();
        if (err != cudaSuccess) {
            printf("CUDA error on device %d: %s\n", device_id, cudaGetErrorString(err));
        }
    }

    return 0;
}

Expected output

Device ID from the host (thread 0): 0
Device ID from the kernel: 0
Device ID from the host (thread 1): 1
Device ID from the kernel: 1

etc.

Actual output

Device ID from the host (thread 0): 0
Device ID from the kernel: 0
Device ID from the host (thread 1): 1
Device ID from the kernel: 0

etc.

What am I doing wrong?


Solution

  • When using the device runtime API (here, calling cudaGetDevice() from device code), it is frequently necessary to specify relocatable device code with device linking.

    For a single nvcc command, a shortcut for this specification is -rdc=true.

    This is presumably necessary because the device code in your kernel must be linked against the device-side library that provides the device runtime API entry points. (A sketch of the equivalent separate compile and device-link steps follows the full example below.)

    It has already been confirmed in the comments that this fixes the issue. No changes to the posted code are required, only a modification of the compile command, which you didn't post.

    Nevertheless, here is a full example:

    # cat t1.cu
    #include <cuda_runtime.h>
    #include <cstdio>   // printf (host and device)
    #include <iostream>
    #include <omp.h>
    
    // Kernel to print the device ID from the GPU
    __global__ void printGPUDeviceID() {
        int deviceID;
        cudaGetDevice(&deviceID);  // Get the current device ID
        printf("Device ID from the kernel: %d\n", deviceID);
    }
    
    int main() {
        // Get the number of available devices
        int num_devices;
        cudaGetDeviceCount(&num_devices);
        if (num_devices < 2) {
            std::cout << "This example requires at least two GPUs." << std::endl;
            return 1;
        }
    
        // Use OpenMP to create threads for each GPU
        #pragma omp parallel num_threads(num_devices)
        {
            int thread_id = omp_get_thread_num();  // Get the OpenMP thread ID
            int device_id = thread_id;             // Assign one device per thread
    
            // Set the current device for this thread
            cudaSetDevice(device_id);
    
            // Get and print the device ID from the host
            int deviceIDFromHost;
            cudaGetDevice(&deviceIDFromHost);
            printf("Device ID from the host (thread %d): %d\n", thread_id, deviceIDFromHost);
    
            // Launch a kernel to print the device ID from the GPU
            printGPUDeviceID<<<1, 1>>>();
    
            // Wait for the GPU to finish
            cudaDeviceSynchronize();
    
            // Check for any errors during kernel execution
            cudaError_t err = cudaGetLastError();
            if (err != cudaSuccess) {
                printf("CUDA error on device %d: %s\n", device_id, cudaGetErrorString(err));
            }
        }
    
        return 0;
    }
    # nvcc -Xcompiler -fopenmp t1.cu -o t1 -lgomp -rdc=true 
    # OMP_NUM_THREADS=2 ./t1
    Device ID from the host (thread 0): 0
    Device ID from the host (thread 1): 1
    Device ID from the kernel: 1
    Device ID from the kernel: 0
    #
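    As an aside, the -rdc=true shortcut can also be expressed as separate compile and device-link steps, which is how it often appears in larger HPC build systems. Here is a minimal sketch, assuming a Linux toolchain and a default CUDA install under /usr/local/cuda (both assumptions; adjust paths and library locations for your cluster):

    # compile with relocatable device code (-dc)
    nvcc -Xcompiler -fopenmp -dc t1.cu -o t1.o
    # device-link step: resolves device-side references such as the
    # device runtime entry points behind cudaGetDevice() in the kernel
    nvcc -dlink t1.o -o t1_dlink.o
    # final host link: -lcudadevrt provides the device runtime library
    g++ t1.o t1_dlink.o -o t1 -L/usr/local/cuda/lib64 -lcudadevrt -lcudart -lgomp

    Either way, the key point is the same: the device-link step is what pulls in the device runtime library that the in-kernel cudaGetDevice() call depends on.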