
False dependency issue for the Fermi architecture


I am trying to achieve "3-way overlap" (H2D copy, kernel execution, and D2H copy running concurrently) using 3 streams, as in the examples in the CUDA streams and concurrency webinar, but I couldn't achieve it.

I have a GeForce GT 550M (Fermi architecture with one copy engine) and I am using Windows 7 (64-bit).

Here is the code that I have written.

#include <iostream>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

// includes, project
#include "helper_cuda.h"
#include "helper_functions.h" // helper utility functions 

#include <stdio.h>

using namespace std;

#define DATA_SIZE 6000000
#define NUM_THREADS 32
#define NUM_BLOCKS 16
#define NUM_STREAMS 3

__global__ void kernel(const int *in, int *out, int dataSize)
{
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int end = dataSize;
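    // grid-stride loop: each thread processes elements start, start + stride, ...,
    // with a stride equal to the total number of launched threads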
    for (int i = start; i < end; i += blockDim.x * gridDim.x) 
    {
        out[i] = in[i] * in[i];
    }
}

int main()
{
    const int dataSize = DATA_SIZE;
    int *h_in = new int[dataSize];
    int *h_out = new int[dataSize];
    int *h_groundTruth = new int[dataSize];

    // Input population
    for(int i = 0; i < dataSize; i++)
        h_in[i] = 5;

    for(int i = 0; i < dataSize; i++)
        h_out[i] = 0;

    // CPU calculation for ground truth
    for(int i = 0; i < dataSize; i++)
        h_groundTruth[i] = h_in[i] * h_in[i];

    // Choose which GPU to run on, change this on a multi-GPU system.
    checkCudaErrors( cudaSetDevice(0) );

    int *d_in = 0;
    int *d_out = 0;
    int streamSize = dataSize / NUM_STREAMS;
    size_t memSize = dataSize * sizeof(int);
    size_t streamMemSize = memSize / NUM_STREAMS;
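    // note: the divisions above assume dataSize is evenly divisible by NUM_STREAMS;
    // any remainder elements would be silently dropped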

    checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
    checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );

    // register host memory as page-locked (required for cudaMemcpyAsync to be truly asynchronous)
    checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
    checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));

    // set kernel launch config
    dim3 nThreads = dim3(NUM_THREADS,1,1);
    dim3 nBlocks = dim3(NUM_BLOCKS,1,1);

    cout << "GPU Kernel Configuration : " << endl;
    cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
    cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
    cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;

    // create cuda stream
    cudaStream_t streams[NUM_STREAMS];
    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamCreate(&streams[i]));

    // create cuda event handles
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    cudaEventRecord(start, 0);

    // overlapped execution using version 2

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    }

    //cudaMemcpy(d_in, h_in, memSize, cudaMemcpyHostToDevice);

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        //kernel<<<nBlocks, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize);
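        // split each stream's chunk into two half-size kernel launches so that
        // commands from different streams can be interleaved more finely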
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
    }

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
    }



    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamSynchronize(streams[i]));

    cudaEventRecord(stop, 0);

    checkCudaErrors(cudaStreamSynchronize(0));
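    // synchronizing the default stream guarantees that the stop event has completed
    // before cudaEventElapsedTime is called; the device-wide synchronization below
    // is then a redundant safety net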

    checkCudaErrors(cudaDeviceSynchronize());

    float gpu_time = 0;
    checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));


    // release resources
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaHostUnregister(h_in));
    checkCudaErrors(cudaHostUnregister(h_out));
    checkCudaErrors(cudaFree(d_in));
    checkCudaErrors(cudaFree(d_out));

    for(int i = 0; i < NUM_STREAMS; i++)
        checkCudaErrors(cudaStreamDestroy(streams[i]));

    cudaDeviceReset();  

    cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;


    // GPU output check
    int sum = 0;
    for(int i = 0; i < dataSize; i++)       
        sum += h_groundTruth[i] - h_out[i];

    cout << "Error between CPU and GPU: " << sum << endl;

    delete[] h_in;
    delete[] h_out;
    delete[] h_groundTruth;

    return 0;
}

Profiling with Nsight, I get this result:

[Nsight timeline: three streams, breadth-first issue order]

It may seem correct, but why does the D2H transfer in stream #1 start only after the last kernel launch of stream #2, and not before? I also tried using 8 streams (just by changing NUM_STREAMS to 8) to achieve such a "3-way overlap", and here is the result:

[Nsight timeline: eight streams]

The interesting thing is that when I use 8 streams, the overlap between computation and memory transfers seems to be much better.

What is the reason for this behavior? Is it due to the WDDM driver, or is there something wrong with my program?


Solution

  • From the comments above, it seems that the OP's problem is a false dependency issue, which affects the Fermi architecture and is solved by the Hyper-Q feature of the Kepler architecture.

    To summarize, the OP is highlighting the fact that the first D2H transfer (stream #1) does not start immediately after the last H2D (stream #3) finishes, while in principle it could. On Fermi, commands from all streams are funneled into a small number of hardware work queues (one compute queue and, on a card with one copy engine, one copy queue), so operations that end up adjacent in a queue can block each other even when they belong to independent streams; this is the false dependency mentioned above. The time gap is highlighted by the red circle in the following figure (from now on, unless otherwise specified, all tests refer to a GeForce GT540M belonging to the Fermi family):

    [Nsight timeline on the GT540M: the gap between the last H2D copy and the first D2H copy is circled in red]
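
    As a quick sanity check (a minimal sketch, not part of the OP's code), the number of copy engines can be queried through cudaGetDeviceProperties; Fermi GeForce cards such as the GT540M report a single copy engine:

    #include <cstdio>

    #include "cuda_runtime.h"

    int main()
    {
        cudaDeviceProp prop;
        cudaGetDeviceProperties(&prop, 0);
        printf("%s: compute capability %d.%d, %d copy engine(s)\n",
               prop.name, prop.major, prop.minor, prop.asyncEngineCount);
        return 0;
    }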

    The OP's approach is breadth-first, issuing work according to the following scheme:

    for(int i = 0; i < NUM_STREAMS; i++)
        cudaMemcpyAsync(..., cudaMemcpyHostToDevice, streams[i]);
    
    for(int i = 0; i < NUM_STREAMS; i++)
    {
        kernel_launch_1<<<..., 0, streams[i]>>>(...);
        kernel_launch_2<<<..., 0, streams[i]>>>(...);
    }
    
    for(int i = 0; i < NUM_STREAMS; i++)
        cudaMemcpyAsync(..., cudaMemcpyDeviceToHost, streams[i]);
    

    Using a depth-first approach, operating according to the following scheme

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        cudaMemcpyAsync(...., cudaMemcpyHostToDevice, streams[i]);
    
        kernel_launch_1<<<...., 0, streams[i]>>>(....);
        kernel_launch_2<<<...., 0, streams[i]>>>(....);
    
        cudaMemcpyAsync(...., cudaMemcpyDeviceToHost, streams[i]);
    }
    

    does not seem to improve the situation; according to the following timeline (the depth-first code is reported at the bottom of this answer), it actually seems to show worse overlap:

    [Nsight timeline: depth-first issue order on the GT540M]

    Under the breadth-first approach, with the second kernel launch commented out, the first D2H copy starts as soon as it can, as reported by the following timeline:

    [Nsight timeline: breadth-first order with the second kernel launch commented out]
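
    For reference, here is a minimal sketch of the kernel-launch loop used for this test; the second half-chunk launch is simply commented out, so only half of each chunk is actually processed, which is acceptable for a timing experiment:

    for(int i = 0; i < NUM_STREAMS; i++)
    {
        int offset = i * streamSize;
        dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));

        kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
        //kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
    }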

    Finally, running the code on a Kepler K20c, the problem does not show up, as illustrated by the following figure:

    [Nsight timeline on the Kepler K20c: the first D2H copy starts with no gap]
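
    As a side note (an assumption going beyond the tests above, valid for Kepler-class devices): the number of hardware work queues exposed by Hyper-Q can be limited through the CUDA_DEVICE_MAX_CONNECTIONS environment variable, which must be set before the CUDA context is created; forcing it to 1 funnels all streams into a single queue and reproduces Fermi-like false dependencies even on Kepler. The helper below is a hypothetical sketch:

    #include <cstdlib>

    // hypothetical helper: must run before the first CUDA call creates the context
    void forceSingleHardwareQueue()
    {
    #ifdef _WIN32
        _putenv_s("CUDA_DEVICE_MAX_CONNECTIONS", "1");   // Windows
    #else
        setenv("CUDA_DEVICE_MAX_CONNECTIONS", "1", 1);   // Linux
    #endif
    }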

    Here is the code for the depth-first approach:

    #include <iostream>
    
    #include "cuda_runtime.h"
    #include "device_launch_parameters.h"
    
    // includes, project
    #include "helper_cuda.h"
    #include "helper_functions.h" // helper utility functions
    
    #include <stdio.h>
    
    using namespace std;
    
    #define DATA_SIZE 6000000
    #define NUM_THREADS 32
    #define NUM_BLOCKS 16
    #define NUM_STREAMS 3
    
    __global__ void kernel(const int *in, int *out, int dataSize)
    {
        int start = blockIdx.x * blockDim.x + threadIdx.x;
        int end = dataSize;
        for (int i = start; i < end; i += blockDim.x * gridDim.x)
        {
            out[i] = in[i] * in[i];
        }
    }
    
    int main()
    {
        const int dataSize = DATA_SIZE;
        int *h_in = new int[dataSize];
        int *h_out = new int[dataSize];
        int *h_groundTruth = new int[dataSize];
    
        // Input population
        for(int i = 0; i < dataSize; i++)
            h_in[i] = 5;
    
        for(int i = 0; i < dataSize; i++)
            h_out[i] = 0;
    
        // CPU calculation for ground truth
        for(int i = 0; i < dataSize; i++)
            h_groundTruth[i] = h_in[i] * h_in[i];
    
        // Choose which GPU to run on, change this on a multi-GPU system.
        checkCudaErrors( cudaSetDevice(0) );
    
        int *d_in = 0;
        int *d_out = 0;
        int streamSize = dataSize / NUM_STREAMS;
        size_t memSize = dataSize * sizeof(int);
        size_t streamMemSize = memSize / NUM_STREAMS;
    
        checkCudaErrors( cudaMalloc( (void **)&d_in, memSize) );
        checkCudaErrors( cudaMalloc( (void **)&d_out, memSize) );
    
        // register host memory as page-locked (required for cudaMemcpyAsync to be truly asynchronous)
        checkCudaErrors(cudaHostRegister(h_in, memSize, cudaHostRegisterPortable));
        checkCudaErrors(cudaHostRegister(h_out, memSize, cudaHostRegisterPortable));
    
        // set kernel launch config
        dim3 nThreads = dim3(NUM_THREADS,1,1);
        dim3 nBlocks = dim3(NUM_BLOCKS,1,1);
    
        cout << "GPU Kernel Configuration : " << endl;
        cout << "Number of Streams :\t" << NUM_STREAMS << " with size: \t" << streamSize << endl;
        cout << "Number of Threads :\t" << nThreads.x << "\t" << nThreads.y << "\t" << nThreads.z << endl;
        cout << "Number of Blocks :\t" << nBlocks.x << "\t" << nBlocks.y << "\t" << nBlocks.z << endl;
    
        // create cuda stream
        cudaStream_t streams[NUM_STREAMS];
        for(int i = 0; i < NUM_STREAMS; i++)
            checkCudaErrors(cudaStreamCreate(&streams[i]));
    
        // create cuda event handles
        cudaEvent_t start, stop;
        checkCudaErrors(cudaEventCreate(&start));
        checkCudaErrors(cudaEventCreate(&stop));
    
        cudaEventRecord(start, 0);
    
        for(int i = 0; i < NUM_STREAMS; i++)
        {
            int offset = i * streamSize;
    
            cudaMemcpyAsync(&d_in[offset], &h_in[offset], streamMemSize, cudaMemcpyHostToDevice, streams[i]);
    
            dim3 subKernelBlock = dim3((int)ceil((float)nBlocks.x / 2));
    
            kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset], &d_out[offset], streamSize/2);
            kernel<<<subKernelBlock, nThreads, 0, streams[i]>>>(&d_in[offset + streamSize/2], &d_out[offset + streamSize/2], streamSize/2);
    
            cudaMemcpyAsync(&h_out[offset], &d_out[offset], streamMemSize, cudaMemcpyDeviceToHost, streams[i]);
        }
    
    
    
        for(int i = 0; i < NUM_STREAMS; i++)
            checkCudaErrors(cudaStreamSynchronize(streams[i]));
    
        cudaEventRecord(stop, 0);
    
        checkCudaErrors(cudaStreamSynchronize(0));
    
        checkCudaErrors(cudaDeviceSynchronize());
    
        float gpu_time = 0;
        checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
    
    
        // release resources
        checkCudaErrors(cudaEventDestroy(start));
        checkCudaErrors(cudaEventDestroy(stop));
        checkCudaErrors(cudaHostUnregister(h_in));
        checkCudaErrors(cudaHostUnregister(h_out));
        checkCudaErrors(cudaFree(d_in));
        checkCudaErrors(cudaFree(d_out));
    
        for(int i = 0; i < NUM_STREAMS; i++)
            checkCudaErrors(cudaStreamDestroy(streams[i]));
    
        cudaDeviceReset();  
    
        cout << "Execution Time of GPU: " << gpu_time << "ms" << endl;
    
    
        // GPU output check
        int sum = 0;
        for(int i = 0; i < dataSize; i++)      
            sum += h_groundTruth[i] - h_out[i];
    
        cout << "Error between CPU and GPU: " << sum << endl;
    
        delete[] h_in;
        delete[] h_out;
        delete[] h_groundTruth;
    
        return 0;
    }