Invalid configuration argument for thread block greater than 16bit

This code works fine:

#include <stdio.h>
#define N 1000 // <-- Works for values < 2^16

__global__
void add(int *a, int *b) {
    int i = blockIdx.x;
    if (i<N) {
        b[i] = 2*a[i];
    }
}
int main() {
    int max_value[2];
    int ha[N], hb[N];
    int *da, *db;
    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int i = 0; i<N; ++i) {
        ha[i] = i;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    add<<<N, 1>>>(da, db);
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);
    max_value[0] = hb[0];
    int i;
    for (i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    getchar();
    return 0;
}

But when I change the number N (items in the array) from 1000 to >(2¹⁶)-1 the program crashes.

I thought it was an overflow on host, so I moved the array declaration of ha and hb to BSS segment and changed N to 1 million.

#include <stdio.h>
#define N 1000000 // <----

__global__
void add(int *a, int *b) {
    int i = blockIdx.x;
    if (i<N) {
        b[i] = 2*a[i];
    }
}
static int ha[N]; // <----
static int hb[N]; // <----
int main() {
    int max_value[2];
    // int ha[N], hb[N];
    int *da, *db;
    cudaMalloc((void **)&da, N*sizeof(int));
    cudaMalloc((void **)&db, N*sizeof(int));
    for (int i = 0; i<N; ++i) {
        ha[i] = i;
    }
    cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice);
    add<<<N, 1>>>(da, db);
    cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost);
    max_value[0] = hb[0];
    int i;
    for (i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    cudaFree(da);
    cudaFree(db);
    printf("Max number %d, from value:%d \n", max_value[0], max_value[1]);
    getchar();
    return 0;
}

Now I don't get an error but the hb array is empty.
Whats wrong with my code?
How can I allocate big arrays to device and get a valid result?

UPDATE: I've inserted the code for error checking,
the error I'm getting is -> "Invalid configuration argument".
The updated code is:

#include <stdio.h>
#include <time.h>
#include <math.h>
#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>
#include <sstream>
const int N = 70000;

#define checkCudaErrors(error) {\
    if (error != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(error));\
        exit(1);\
        }\
}\

__global__
void add(int *a, int *b) {
    int i = blockIdx.x;
    if (i<N) {
        b[i] = 2*a[i];
    }
}
static int ha[N];
static int hb[N];
int main() {
    // int ha[N], hb[N];
    int max_value[2];

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int i = 0; i<N; ++i) { ha[i] = i; }
    int *da, *db;
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));
    add<<<N, 1>>>(da, db);  // <--- Invalid configuration error
    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));
    max_value[0] = hb[0];
    int i;
    for (i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    cudaError_t error = cudaGetLastError();     
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }
    getchar();
    return 0;
}

The device is a GeForce GTX 470 and I'm compiling using
nvcc -o foo new.cu

Solution

Your device (GTX 470) is a cc2.0 device (compute capability).

The invalid configuration argument error is due to the fact that for cc2.0 devices, the number of blocks for a 1-D grid is limited to 65535. This information is discoverable in the programming guide ("Maximum x-dimension of a grid of thread blocks") or by running the deviceQuery CUDA sample code. So your choice of N here is too large:

add<<<N, 1>>>(da, db);
      ^

The usual workaround for this with a cc2.0 device is to create a grid of threadblocks that is multidimensional, which allows for a much larger number of threadblocks. The kernel launch parameters can actually be dim3 variables which allow specification of multidimensional grids (of threadblocks) or multidimensional threadblocks (of threads).

To do this properly, you will also need to change your kernel code to create a proper globally unique thread ID from the multidimensional variables available to you.

The following worked example gives one possible minimal set of changes to demonstrate the concept, and appears to run correctly for me:

$ cat t363.cu
#include <stdio.h>
#include <time.h>
#include <math.h>
#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>
#include <sstream>
const int N = 70000;

#define checkCudaErrors(error) {\
    if (error != cudaSuccess) {\
        printf("CUDA Error - %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(error));\
        exit(1);\
        }\
}\

__global__
void add(int *a, int *b) {
    int i = blockIdx.x + blockIdx.y*gridDim.x;
    if (i<N) {
        b[i] = 2*a[i];
    }
}
static int ha[N];
static int hb[N];
int main() {
    int max_value[2];

    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    cudaError_t err=cudaDeviceReset();
    if(err!=cudaSuccess){printf("%s in %s at line %d\n",cudaGetErrorString(err),__FILE__,__LINE__);}
    printf("Device count: %d \n", deviceCount);

    for (int i = 0; i<N; ++i) { ha[i] = i; }
    int *da, *db;
    checkCudaErrors(cudaMalloc((void **)&da, N*sizeof(int)));
    checkCudaErrors(cudaMalloc((void **)&db, N*sizeof(int)));
    checkCudaErrors(cudaMemcpy(da, ha, N*sizeof(int), cudaMemcpyHostToDevice));
    dim3 mygrid(N/10, 10);
    add<<<mygrid, 1>>>(da, db);
    checkCudaErrors(cudaMemcpy(hb, db, N*sizeof(int), cudaMemcpyDeviceToHost));
    max_value[0] = hb[0];
    int i;
    for (i = 0; i < N; i++) {
        if (hb[i] > max_value[0]) {
            max_value[0] = hb[i];
            max_value[1] = i;
        }
    }
    printf("max_value[0] = %d, max_value[1] = %d\n", max_value[0], max_value[1]);
    cudaError_t error = cudaGetLastError();
    if(error != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(error));
        getchar();
        exit(-1);
    }
    return 0;
}
$ nvcc -arch=sm_20 -o t363 t363.cu
nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).
$ ./t363
Device count: 4
max_value[0] = 139998, max_value[1] = 69999
$

Notes:

If you ran your original code on a cc3.0 or higher device, it should not throw that error. Newer CUDA devices raised the 1D grid limit to 2^31-1. But if you wanted to exceed that number of blocks (around 2B) then you would again have to go to a multidimensional grid.

cc2.0 devices are deprecated in CUDA 8, and support for them is being dropped from the upcoming CUDA 9 release.