difference between elapsed time in matrix multiplication with and without using shared memory in cuda

I am new to CUDA programming. My program performs matrix multiplication with and without shared memory, using the code from the CUDA C Programming Guide. According to the guide, the version that uses shared memory should have a shorter elapsed time than the version that does not. But when I run the programs, it is the other way around. Does anyone know why? Or am I doing something wrong?

non-shared memory :

#include <stdio.h>
#include <stdlib.h>
#include <conio.h>
#include <iostream>
#include <thrust/system_error.h>
#include <thrust/system/cuda_error.h>
#include <sstream>

// Dense matrix descriptor. The kernels in this program address an entry as
// elements[row * width + col].
typedef struct _Matrix
{
    int height;        // row count
    int width;         // column count
    float *elements;   // height * width values (host or device pointer, depending on use)
} Matrix;


#define BLOCK_SIZE 20
// Element-wise sum kernel: c = a + b.
//
// Launch layout: 2D grid of 2D blocks, one thread per element
// (x -> column, y -> row). All three matrices are indexed with a.width as the
// row length, so b and c are expected to have the same dimensions as a.
// Threads that fall outside a's bounds return without touching memory, so the
// grid may over-cover the matrix.
__global__ void add_matrix_kernel(const Matrix a,const Matrix b,Matrix c)
{
    const int N = a.width;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail: without this, a grid rounded up past the matrix
    // edge would read and write out of bounds.
    if (row >= a.height || col >= N)
        return;

    const int idx = row * N + col;
    c.elements[idx] = a.elements[idx] + b.elements[idx];
}

// Naive matrix product kernel: c = a * b, reading every operand straight from
// the element arrays (no shared-memory tiling).
//
// Launch layout: 2D grid of 2D blocks, one thread per element of c
// (x -> column, y -> row). Expects a.width == b.height, and c to be
// a.height x b.width. Threads outside that range return immediately.
__global__ void simpleMultiply(const Matrix a,const Matrix b, Matrix c)
{
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;

    // Guard the grid tail so an over-covering grid cannot index out of bounds.
    if (row >= a.height || col >= b.width)
        return;

    // Accumulate in float: an int accumulator would truncate the running sum
    // after every multiply-add and force int<->float conversions per step.
    float sum = 0.0f;
    for (int i = 0; i < a.width; i++)
    {
        // Each matrix is indexed with its own row length, so non-square
        // operands are addressed correctly.
        sum += a.elements[row * a.width + i] * b.elements[i * b.width + col];
    }
    c.elements[row * c.width + col] = sum;
}

// Host wrapper: uploads A and B, launches simpleMultiply to compute C = A * B,
// and downloads the result into C.elements. (The name is historical: the
// element-wise add_matrix_kernel launch is commented out below.)
//
// A, B and C carry host pointers; C.elements must have room for
// C.width * C.height floats. The grid is sized with integer division, so
// B.width and A.height are assumed to be multiples of BLOCK_SIZE.
//
// On any CUDA failure the remaining steps are skipped, a message is written to
// stderr, and the device buffers are still released. The launch status is read
// with cudaPeekAtLastError() so the caller can still observe it afterwards.
void add_matrix(const Matrix A,const Matrix B,Matrix C)
{
    Matrix d_A;
    Matrix d_B;
    Matrix d_C;

    // Device-side descriptors mirror the host dimensions; pointers start as
    // NULL so the cleanup at the end is safe even after an early failure.
    d_A.width = A.width; d_A.height = A.height; d_A.elements = NULL;
    d_B.width = B.width; d_B.height = B.height; d_B.elements = NULL;
    d_C.width = C.width; d_C.height = C.height; d_C.elements = NULL;

    const size_t sizeA = (size_t)A.width * A.height * sizeof(float);
    const size_t sizeB = (size_t)B.width * B.height * sizeof(float);
    const size_t sizeC = (size_t)C.width * C.height * sizeof(float);

    // Allocate space for the device copies of A, B and C.
    cudaError_t err = cudaMalloc((void **)&d_A.elements, sizeA);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&d_B.elements, sizeB);
    if (err == cudaSuccess)
        err = cudaMalloc((void **)&d_C.elements, sizeC);

    // Copy the inputs to the device, each with its own byte count.
    if (err == cudaSuccess)
        err = cudaMemcpy(d_A.elements, A.elements, sizeA, cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_B.elements, B.elements, sizeB, cudaMemcpyHostToDevice);

    if (err == cudaSuccess)
    {
        dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
        dim3 dimGrid(B.width/dimBlock.x, A.height/dimBlock.y);

        //add_matrix_kernel<<<grid_size,block_size>>>(d_A, d_B, d_C);

        simpleMultiply<<<dimGrid,dimBlock>>>(d_A,d_B,d_C);
        // Launch-configuration errors are only visible through the last-error
        // state; peek so the error is not cleared for the caller.
        err = cudaPeekAtLastError();
    }

    // Read C back from device memory (blocking copy; also waits for the kernel).
    if (err == cudaSuccess)
        err = cudaMemcpy(C.elements, d_C.elements, sizeC, cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf(stderr, "add_matrix: CUDA error: %s\n", cudaGetErrorString(err));

    // Free device memory; cudaFree on a NULL pointer is a no-op.
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

//void print_matrix(int *c,int row,int col)
//{
//  for (int i = 0; i < row; ++i){
//      for (int j = 0; j < col; ++j)
//          printf("%d ",c[col*i+j]);
//      printf("\n\n");
//  }
//}

// Dump a host-side matrix to stdout: a "Matrix:" banner, then for each row a
// blank line, a "row <n>" header and the row's values separated by tabs.
void print_matrix(Matrix A){
    const int total = A.width * A.height;
    int next_row = 0;

    printf("Matrix:\n");
    for (int idx = 0; idx < total; ++idx) {
        // A new row starts whenever the flat index is a multiple of the width.
        const bool starts_row = (idx % A.width == 0);
        if (starts_row) {
            printf("\n");
            printf("row %d\n", next_row);
            ++next_row;
        }
        printf("%6.4f\t", A.elements[idx]);
    }
    printf("\n");
}

// Throws thrust::system_error when `code` is not cudaSuccess; otherwise does
// nothing. The exception message is "<file>(<line>)".
//
//   code - status returned by a CUDA runtime call
//   file - source file name to report (typically __FILE__)
//   line - source line to report (typically __LINE__)
void throw_on_cuda_error(cudaError_t code, const char *file, int line)
{
    if(code == cudaSuccess)
        return;

    std::stringstream ss;
    ss << file << "(" << line << ")";
    // Take the whole buffer: extracting with operator>> stops at the first
    // whitespace and would truncate any path that contains a space.
    throw thrust::system_error(code, thrust::cuda_category(), ss.str());
}

// Driver for the non-shared-memory multiply: fills two 1200x1200 matrices with
// ones, times add_matrix() (allocation, transfers and kernel) with CUDA
// events, and prints the elapsed time plus three sample entries of C.
//
// Every CUDA status is routed through throw_on_cuda_error, so a failure lands
// in the catch block instead of being ignored. The final cudaSetDevice(-1)
// call is kept from the original; it looks like a deliberate demonstration of
// that error path, so the catch block runs on every execution.
int main()
{
    cudaEvent_t start,stop;
    Matrix A,B;
    Matrix C;
    // Declared outside the try block and NULL-initialised so the buffers can
    // be released on both the normal and the exceptional path.
    A.elements = NULL;
    B.elements = NULL;
    C.elements = NULL;
    try{
        A.width=1200;
        A.height=1200;
        B.width=1200;
        B.height=1200;
        C.width=B.width;
        C.height=A.height;
        size_t sizeA = (size_t)A.width * A.height * sizeof(float);
        A.elements = (float *)malloc(sizeA);
        //random_init(A.elements,A.width * A.height );
        size_t sizeB = (size_t)B.width * B.height * sizeof(float);
        B.elements= (float *)malloc(sizeB);
        //random_init(B.elements,B.width * B.height);
        size_t sizeC = (size_t)C.width * C.height * sizeof(float);
        C.elements= (float *)malloc(sizeC);
        if(A.elements == NULL || B.elements == NULL || C.elements == NULL)
        {
            fprintf(stderr,"host allocation failed\n");
            free(A.elements);
            free(B.elements);
            free(C.elements);
            return 1;
        }
        for(int i=0;i<A.width*A.height;i++)
            A.elements[i]=1;

        for(int i=0;i<B.width*B.height;i++)
            B.elements[i]=1;
        printf("matrix A(%d,%d) & matrix B(%d,%d) & matrix C(%d,%d)\n",A.height,A.width,B.height,B.width,C.height,C.width);
        throw_on_cuda_error(cudaEventCreate(&start), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventCreate(&stop), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventRecord(start,0), __FILE__, __LINE__);

        add_matrix(A,B,C);
        // Surface any error left behind by the launch or the runtime calls
        // made inside add_matrix.
        throw_on_cuda_error(cudaPeekAtLastError(), __FILE__, __LINE__);
        throw_on_cuda_error(cudaDeviceSynchronize(), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventRecord(stop,0), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventSynchronize(stop), __FILE__, __LINE__);
        float elapsedTime;
        throw_on_cuda_error(cudaEventElapsedTime(&elapsedTime,start,stop), __FILE__, __LINE__);

        printf("Time to genreat : %3.5f ms\n",elapsedTime);
        cudaEventDestroy(start);
        cudaEventDestroy(stop);

        printf("\nC\n");
        //  print_matrix(C);
        printf("C[%d] = %f\n",0,C.elements[0]);
        printf("C[%d] = %f\n",(C.width)-1,C.elements[(C.width)-1]);
        printf("C[%d] = %f\n",(C.width)*(C.height)-1,C.elements[(C.width)*(C.height)-1]);
        getchar();
        throw_on_cuda_error(cudaSetDevice(-1), __FILE__, __LINE__);
    }
    catch(thrust::system_error &e)
    {
        // Generic prefix: any checked CUDA call above can end up here, not
        // only the final cudaSetDevice.
        std::cerr << "CUDA error: " << e.what() << std::endl;

        // oops, recover
        cudaSetDevice(0);
    }
    free(A.elements);
    free(B.elements);
    free(C.elements);
    return 0;

}

Using shared memory:

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)

#include <stdio.h>
#include <iostream>
#include <thrust/system_error.h>
#include <thrust/system/cuda_error.h>
#include <sstream>

#define BLOCK_SIZE 20

// Row-major matrix view. `stride` is the row length of the underlying storage,
// so a sub-matrix can share its parent's buffer:
//   M(row, col) == elements[row * stride + col]
typedef struct {
    int width;        // number of columns
    int height;       // number of rows
    int stride;       // elements between the starts of consecutive rows
    float* elements;  // pointer to the entry at (0, 0)
} Matrix;

// Read the entry at (row, col) of A, honouring A's row stride.
__device__ float GetElement(const Matrix A, int row, int col)
{
    const int offset = row * A.stride + col;
    return A.elements[offset];
}

// Store `value` at (row, col) of A, honouring A's row stride.
__device__ void SetElement(Matrix A, int row, int col,
    float value)
{
    const int offset = row * A.stride + col;
    A.elements[offset] = value;
}

// Build a view of the BLOCK_SIZE x BLOCK_SIZE tile of A that sits `row` tiles
// down and `col` tiles to the right of A's upper-left corner. The view shares
// A's storage and stride; nothing is copied.
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix tile;

    tile.stride = A.stride;
    tile.height = BLOCK_SIZE;
    tile.width = BLOCK_SIZE;
    // Skip `row` bands of BLOCK_SIZE full rows, then `col` tiles within the band.
    tile.elements = A.elements + (A.stride * BLOCK_SIZE * row + BLOCK_SIZE * col);
    return tile;
}

// Forward declaration of the matrix multiplication kernel, so that MatMul()
// below can launch it before its definition appears. The thread block size
// used for the launch is BLOCK_SIZE x BLOCK_SIZE.
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - host code. Uploads A and B, launches MatMulKernel to
// compute C = A * B, and downloads the result into C.elements.
//
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE (the grid is
// sized with integer division). A, B and C carry host pointers.
//
// On any CUDA failure the remaining steps are skipped, a message is written to
// stderr, and the device buffers are still released. The launch status is read
// with cudaPeekAtLastError() so the caller can still query it afterwards.
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    Matrix d_A;
    Matrix d_B;
    Matrix d_C;

    // Device descriptors are densely packed, so stride == width. Pointers start
    // as NULL so the cleanup below is safe after an early failure.
    d_A.width = d_A.stride = A.width; d_A.height = A.height; d_A.elements = NULL;
    d_B.width = d_B.stride = B.width; d_B.height = B.height; d_B.elements = NULL;
    d_C.width = d_C.stride = C.width; d_C.height = C.height; d_C.elements = NULL;

    const size_t sizeA = (size_t)A.width * A.height * sizeof(float);
    const size_t sizeB = (size_t)B.width * B.height * sizeof(float);
    const size_t sizeC = (size_t)C.width * C.height * sizeof(float);

    // Load A and B to device memory
    cudaError_t err = cudaMalloc(&d_A.elements, sizeA);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_A.elements, A.elements, sizeA,
            cudaMemcpyHostToDevice);
    if (err == cudaSuccess)
        err = cudaMalloc(&d_B.elements, sizeB);
    if (err == cudaSuccess)
        err = cudaMemcpy(d_B.elements, B.elements, sizeB,
            cudaMemcpyHostToDevice);

    // Allocate C in device memory
    if (err == cudaSuccess)
        err = cudaMalloc(&d_C.elements, sizeC);

    // Invoke kernel
    if (err == cudaSuccess)
    {
        dim3 dimBlock(BLOCK_SIZE,BLOCK_SIZE);
        dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);

        MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);
        // Launch-configuration errors are only visible through the last-error
        // state; peek so the error is not cleared for the caller.
        err = cudaPeekAtLastError();
    }

    // Read C from device memory (blocking copy; also waits for the kernel)
    if (err == cudaSuccess)
        err = cudaMemcpy(C.elements, d_C.elements, sizeC,
            cudaMemcpyDeviceToHost);

    if (err != cudaSuccess)
        fprintf(stderr, "MatMul: CUDA error: %s\n", cudaGetErrorString(err));

    // Free device memory; cudaFree on a NULL pointer is a no-op.
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Tiled matrix multiplication kernel launched by MatMul(): C = A * B.
//
// Launch layout: one BLOCK_SIZE x BLOCK_SIZE thread block per
// BLOCK_SIZE x BLOCK_SIZE tile of C; each thread produces one element of that
// tile. Uses two static shared-memory tiles of BLOCK_SIZE * BLOCK_SIZE floats.
// A.width is walked in whole tiles (A.width / BLOCK_SIZE iterations).
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Staging buffers for the current tile of A and of B, shared by the block.
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    // Which tile of C this block owns, and this thread's spot inside it.
    const int tileRow = blockIdx.y;
    const int tileCol = blockIdx.x;
    const int ty = threadIdx.y;
    const int tx = threadIdx.x;

    Matrix Csub = GetSubMatrix(C, tileRow, tileCol);

    // Running dot product for this thread's element of Csub.
    float acc = 0;

    const int numTiles = A.width / BLOCK_SIZE;
    for (int t = 0; t < numTiles; ++t) {
        // Tile t along A's row band and the matching tile down B's column band.
        const Matrix Asub = GetSubMatrix(A, tileRow, t);
        const Matrix Bsub = GetSubMatrix(B, t, tileCol);

        // Each thread stages one element of each tile.
        As[ty][tx] = GetElement(Asub, ty, tx);
        Bs[ty][tx] = GetElement(Bsub, ty, tx);

        // Both tiles must be completely loaded before any thread reads them.
        __syncthreads();

        for (int k = 0; k < BLOCK_SIZE; ++k)
            acc += As[ty][k] * Bs[k][tx];

        // Every thread must be done with these tiles before the next
        // iteration overwrites them.
        __syncthreads();
    }

    // Each thread writes its single element of Csub.
    SetElement(Csub, ty, tx, acc);
}


//////////////////////////////////////////////////////////
/// print_matrix function ///////////////////////////
////////////////////////////////////////////////////////
// Print a row x col matrix stored contiguously in c: one line per matrix row,
// values separated by spaces, with a blank line after each row.
void print_matrix(float *c,int row,int col){
    for (int r = 0; r < row; ++r) {
        const int base = col * r;   // flat index of the first value in row r
        for (int k = 0; k < col; ++k) {
            printf("%f ", c[base + k]);
        }
        printf("\n\n");
    }
}

//////////////////////////////////////////////////////////
/// random_init function ///////////////////////////
////////////////////////////////////////////////////////
// Fill the first `size` entries of a with values rand() % 10, i.e. whole
// numbers from 0 to 9 drawn from the C library generator.
void random_init(float *a,int size){
    int filled = 0;
    while (filled < size) {
        a[filled] = rand() % 10;
        ++filled;
    }
}
////////////////////////////////////////////////////////

// Throws thrust::system_error when `code` is not cudaSuccess; otherwise does
// nothing. The exception message is "<file>(<line>)".
//
//   code - status returned by a CUDA runtime call
//   file - source file name to report (typically __FILE__)
//   line - source line to report (typically __LINE__)
void throw_on_cuda_error(cudaError_t code, const char *file, int line)
{
    if(code == cudaSuccess)
        return;

    std::stringstream ss;
    ss << file << "(" << line << ")";
    // Take the whole buffer: extracting with operator>> stops at the first
    // whitespace and would truncate any path that contains a space.
    throw thrust::system_error(code, thrust::cuda_category(), ss.str());
}

// Driver for the shared-memory multiply: fills two 1200x1200 matrices with
// ones, times MatMul() (allocation, transfers and kernel) with CUDA events,
// and prints the elapsed time plus three sample entries of C.
//
// Every CUDA status is routed through throw_on_cuda_error, so a failure lands
// in the catch block instead of being ignored. The final cudaSetDevice(-1)
// call is kept from the original; it looks like a deliberate demonstration of
// that error path, so the catch block runs on every execution.
int main(void){
    cudaEvent_t start,stop;
    Matrix A,B,C;
    // Declared outside the try block and NULL-initialised so the host buffers
    // can be released on both the normal and the exceptional path.
    A.elements = NULL;
    B.elements = NULL;
    C.elements = NULL;
    try
    {
        A.width=1200;
        A.height=1200;
        B.width=1200;
        B.height=1200;
        C.width=B.width;
        C.height=A.height;

        size_t size = (size_t)A.width * A.height * sizeof(float);
        A.elements = (float *)malloc(size);
        //random_init(A.elements,A.width * A.height );
        size = (size_t)B.width * B.height * sizeof(float);
        B.elements= (float *)malloc(size);
        //random_init(B.elements,B.width * B.height);
        size = (size_t)C.width * C.height * sizeof(float);
        C.elements= (float *)malloc(size);
        if(A.elements == NULL || B.elements == NULL || C.elements == NULL)
        {
            fprintf(stderr,"host allocation failed\n");
            free(A.elements);
            free(B.elements);
            free(C.elements);
            return 1;
        }
        for(int i=0;i<A.width*A.height;i++)
            A.elements[i]=1;
        for(int i=0;i<B.width*B.height;i++)
            B.elements[i]=1;
        printf("matrix A(%d,%d) & matrix B(%d,%d) & matrix C(%d,%d)\n",A.width,A.height,B.width,
               B.height,C.width,C.height);

        throw_on_cuda_error(cudaEventCreate(&start), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventCreate(&stop), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventRecord(start,0), __FILE__, __LINE__);
        MatMul(A,B,C);
        // Surface any error left behind by the launch or the runtime calls
        // made inside MatMul.
        throw_on_cuda_error(cudaPeekAtLastError(), __FILE__, __LINE__);
        throw_on_cuda_error(cudaDeviceSynchronize(), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventRecord(stop,0), __FILE__, __LINE__);
        throw_on_cuda_error(cudaEventSynchronize(stop), __FILE__, __LINE__);
        float elapsedTime;
        throw_on_cuda_error(cudaEventElapsedTime(&elapsedTime,start,stop), __FILE__, __LINE__);
        printf("Time to genreat : %4.5f ms\n",elapsedTime);
        // The events are no longer needed once the elapsed time has been read.
        cudaEventDestroy(start);
        cudaEventDestroy(stop);

        printf("%s\n", cudaGetErrorString(cudaGetLastError()));
        printf("\nC\n");
        //print_matrix(C.elements,C.height,C.width);

        printf("C[%d]=%f\n",0,C.elements[0]);
        printf("C[%d]=%f\n",C.width -1,C.elements[C.width-1]);
        printf("C[%d]=%f\n",(C.width * C.height)-1,C.elements[(C.width * C.height)-1]);

        getchar();
        throw_on_cuda_error(cudaSetDevice(-1), __FILE__, __LINE__);
    }
    catch(thrust::system_error &e)
    {
        // Generic prefix: any checked CUDA call above can end up here, not
        // only the final cudaSetDevice.
        std::cerr << "CUDA error: " << e.what() << std::endl;

        // oops, recover
        cudaSetDevice(0);
    }
    // Release the host buffers (previously never freed).
    free(A.elements);
    free(B.elements);
    free(C.elements);
    return(0);
}

The debug build gives the following output:

'GPU_Matrix.exe': Loaded 'E:\FarnAz\Cuda Project\Projects\GPU_Matrix\Debug\GPU_Matrix.exe', Symbols loaded.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\ntdll.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\kernel32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\KernelBase.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\bin\cudart32_42_9.dll', Binary was not built with debug information.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msvcp100d.dll', Symbols loaded.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msvcr100d.dll', Symbols loaded.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nvcuda.dll', Binary was not built with debug information.
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\user32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\gdi32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\lpk.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\usp10.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msvcrt.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\advapi32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\sechost.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\rpcrt4.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\sspicli.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\cryptbase.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\setupapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\cfgmgr32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\oleaut32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\ole32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\devobj.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\shell32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\shlwapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\ws2_32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nsi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\imm32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msctf.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\ProgramData\Wincert\win32cert.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nvinit.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files (x86)\NVIDIA Corporation\coprocmanager\detoured.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files (x86)\NVIDIA Corporation\coprocmanager\Nvd3d9wrap.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Program Files (x86)\NVIDIA Corporation\coprocmanager\nvdxgiwrap.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Unloaded 'C:\ProgramData\Wincert\win32cert.dll'
The thread 'Win32 Thread' (0x1214) has exited with code 1849301074 (0x6e3a1852).
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\dwmapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Unloaded 'C:\Windows\SysWOW64\dwmapi.dll'
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\nvapi.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\version.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\wintrust.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\crypt32.dll', Cannot find or open the PDB file
'GPU_Matrix.exe': Loaded 'C:\Windows\SysWOW64\msasn1.dll', Cannot find or open the PDB file

E.g. the result for matrices 1000*1000 is about 1219 ms for the non-shared code and about 1770 ms for the shared memory code.

When I build in release mode, the build fails and the error list shows several errors, but I don't know why. The build output in release mode is:

1>------ Build started: Project: GPU_Matrix, Configuration: Release Win32 ------
1>Build started 11/13/2013 10:39:47 AM.
1>InitializeBuildStatus:
1>  Touching "Release\GPU_Matrix.unsuccessfulbuild".
1>AddCudaCompilePropsDeps:
1>Skipping target "AddCudaCompilePropsDeps" because all output files are up-to-date with respect to the input files.
1>CudaBuild:
1>  Compiling CUDA source file main.cu...
1>  
1>  E:\FarnAz\Cuda Project\Projects\GPU_Matrix\GPU_Matrix>"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\bin\nvcc.exe" -gencode=arch=compute_10,code=\"sm_10,compute_10\" --use-local-env --cl-version 2010 -ccbin "C:\Program Files (x86)\Microsoft Visual Studio 10.0\VC\bin"  -I"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v4.2\include"    --keep-dir "Release" -maxrregcount=0  --machine 32 --compile      -Xcompiler "/EHsc /nologo /Od /Zi  /MD  " -o "Release\main.cu.obj" "E:\FarnAz\Cuda Project\Projects\GPU_Matrix\GPU_Matrix\main.cu" 
1>  main.cu
1>  tmpxft_00001c70_00000000-0_main.cudafe1.gpu
1>  tmpxft_00001c70_00000000-5_main.cudafe2.gpu
1>  main.cu
1>  tmpxft_00001c70_00000000-0_main.cudafe1.cpp
1>  tmpxft_00001c70_00000000-11_main.ii
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaFree@4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaConfigureCall@32
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaMemcpy@16
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaMalloc@8
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaGetErrorString@4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaSetDevice@4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventDestroy@4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventElapsedTime@12
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventSynchronize@4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaDeviceSynchronize@0
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaPeekAtLastError@0
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventRecord@8
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaEventCreate@4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaSetupArgument@12
1>main.cu.obj : error LNK2001: unresolved external symbol ___cudaRegisterFunction@40
1>main.cu.obj : error LNK2001: unresolved external symbol ___cudaRegisterFatBinary@4
1>main.cu.obj : error LNK2001: unresolved external symbol ___cudaUnregisterFatBinary@4
1>main.cu.obj : error LNK2001: unresolved external symbol _cudaLaunch@4
1>E:\FarnAz\Cuda Project\Projects\GPU_Matrix\Release\GPU_Matrix.exe : fatal error LNK1120: 18 unresolved externals
1>
1>Build FAILED.
1>
1>Time Elapsed 00:00:08.43
========== Build: 0 succeeded, 1 failed, 0 up-to-date, 0 skipped ==========

I ran vectorAdd in both modes. Then I pasted my code in that project. In debug mode it has no problem and the result in non-shared is about 1372 ms and in shared memory is about 1842 ms. But in release mode it shows a new window that tells :

Debugging information for ‘vectorAdd.exe’ cannot be found or does not match. Binary was not built with debug information. Do you want to continue debugging ?

When I click Yes, it continues and runs with no error. The result is about 645 ms for the non-shared code and about 183 ms for the shared-memory code. I don't understand why the results are reversed in release mode. Which one is correct? Should I trust the release-mode result or the debug-mode result, and does that hold for every project?

Solution

You're getting this message:

“Debugging information for ‘vectorAdd.exe’ cannot be found or does not match. Binary was not built with debug information. Do you want to continue debugging ?” ,

Because of the way you are starting the executable in Visual Studio. When you build the release project you should just run it, rather than "start debugging". You'll need to explore visual studio a bit more.

The results you get in Release mode seem correct. The shared memory code runs faster, as expected. When building a "Debug" project in Visual Studio, the -G switch will normally be passed to the nvcc compiler driver, which has a significant effect on code generation. It does more than just allow for debugging by adding symbols: it disables many optimizations that the compiler might otherwise make, so as to make source-level debugging easier.

You should not evaluate CUDA code performance in "Debug" mode or by passing the -G switch to nvcc.