Search code examples
cudanvidiansight

Achieved Occupancy column is not shown is Nsight Profiling result


I have faced a problem that is very weird to me. I can not see the achieved occupancy column in Nsight Performance Analysis output. I am using Geforce 920M GPU, NVIDIA driver of version 425.31, Nsight version of 6.0.0.18296 and visual studio 2017. The Nsight's version is compatible with driver's. Can anyone help me out? I have quite no idea that why this happens.

enter image description here

I use Nsight performance analysis with CUDA trace checked as bellow:

enter image description here

I also used Visual Profiler but the achieved occupancy could not be seen there, too.enter image description here And the GPU examination gives out an error: enter image description here

  • Note that as talonmies mentioned the error above was due to not running profiler in administrator mode. And solved but achieved occupancy is still not shown.

And here is my code:

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <math.h>
#include <iostream>
#define MAX_HISTORGRAM_NUMBER 10000
#define ARRAY_SIZE 102400000

#define CHUNK_SIZE 100
#define THREAD_COUNT 8
#define SCALER 80
cudaError_t histogramWithCuda(int *a, unsigned long long int *c);

__global__ void histogramKernelSingle(unsigned long long int *c, int *a)
{
    unsigned long long int worker =  blockIdx.x*blockDim.x + threadIdx.x;
    unsigned long long int start = worker * CHUNK_SIZE;
    unsigned long long int end = start + CHUNK_SIZE;
    for (int ex = 0; ex < SCALER; ex++)
        for (long long int i = start; i < end; i++)
        {
            if (i < ARRAY_SIZE)
                atomicAdd(&c[a[i]], 1);
            else
            {
                break;
            }
        }
}

int main()
{
        int* a = (int*)malloc(sizeof(int)*ARRAY_SIZE);
        unsigned long long int* c = (unsigned long long int*)malloc(sizeof(unsigned long long int)*MAX_HISTORGRAM_NUMBER);
        for (unsigned long long i = 0; i < ARRAY_SIZE;i++)
            a[i] = rand() % MAX_HISTORGRAM_NUMBER;
        for (unsigned long long i = 0; i < MAX_HISTORGRAM_NUMBER; i++)
            c[i] = 0;

    // Add vectors in parallel.
        double start_time = omp_get_wtime();
        cudaError_t cudaStatus=histogramWithCuda(a,c);
        double end_time = omp_get_wtime();
        std::cout << end_time - start_time;
   // = 
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addWithCuda failed!");
        return 1;
    }
    
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.
    cudaStatus = cudaDeviceReset();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceReset failed!");
        return 1;
    }
    unsigned long long int R = 0;
    for (int i = 0; i < MAX_HISTORGRAM_NUMBER; i++)
    {
        R += c[i];
        //printf("%d    ", c[i]);
    }
    printf("\nCORRECT:%ld   ", R/(SCALER));
    return 0;
}

// Helper function for using CUDA to add vectors in parallel.
cudaError_t histogramWithCuda(int *a, unsigned long long int *c)
{
    int *dev_a = 0;
    unsigned long long int *dev_c = 0;
    cudaError_t cudaStatus;

    // Choose which GPU to run on, change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Allocate GPU buffers for three vectors (two input, one output)    .
    cudaStatus = cudaMalloc((void**)&dev_c, MAX_HISTORGRAM_NUMBER * sizeof(unsigned long long int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_a, ARRAY_SIZE * sizeof(int));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }


    // Copy input vectors from host memory to GPU buffers.
    cudaStatus = cudaMemcpy(dev_a, a, ARRAY_SIZE * sizeof(int), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    // Launch a kernel on the GPU with one thread for each element.
    //// BLOCK CALCULATOR HERE
    

    ////BLOCK CALCULATOR HERE
    
    histogramKernelSingle << < ARRAY_SIZE / (THREAD_COUNT*CHUNK_SIZE), THREAD_COUNT>> > (dev_c, dev_a);
    // Check for any errors launching the kernel
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    
    // cudaDeviceSynchronize waits for the kernel to finish, and returns
    // any errors encountered during the launch.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy output vector from GPU buffer to host memory.
    cudaStatus = cudaMemcpy(c, dev_c, MAX_HISTORGRAM_NUMBER * sizeof(unsigned long long int), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }
    
Error:
    cudaFree(dev_c);
    cudaFree(dev_a);
    return cudaStatus;
}

Thanks in advance.


Solution

  • Achieved Occupancy is only captured in the Profile Activity. The Trace Activity does not support capturing GPU performance counters. Achieved Occupancy is sm__active_warps_sum / sm__actice_cycles_sum / SM__MAX_WARPS * 100.

    Nsight Visual Studio Edition

    The Trace Activity cannot collect Achieved Occupancy. Run the command Nsight | Start Performance Analysis ... and in the Activity window select Profile CUDA Application (not Trace Application). The default Profile CUDA Application contains the experiment Achieved Occupancy.

    NVIDIA Visual Profiler

    In NVVP ensure that you are collecting GPU performance counters. The default activity will collect the timeline but will not collect GPU events.

    Run | Generate Timeline will not collect Achieved Occupancy Run | Analyze Application will collect Achieved Occupancy

    If you continue to have issues then you may have an issue with permissions on the system. Please try collecting another set of performance counters using Nsight Profile CUDA Application or NVVP | Collect Metrics and Events...