Lauching 2d block of threads in Cuda crashes

I am building a n-body simulation, using CUDA to increase performance. I am currently working on further parallelizing the system so each interaction between particles runs on a seperate thread. This reduces the theoretical complexity to 1 only limited by the speed of the GPU.

In order to accomplish this I am attempting to launch a kernel using an N^2 set of blocks (where N in number of particles) using (N/T, N/T) grid and T*T blocks (where T is the number of threads per block). I have been able to launch an N*N grid but whenever I try to use multidimensional blocks (of threads) the kernel crashes with:

error code invalid configuration arguments

This is with T=512 and N=5000 but reducing these to T=128 and N=1000 had no effect. Here are some specs and code:

Cuda SDK Version: 7.5

GPU: GTX 970 4gb

CC Version: 5.2

Compiling in MSVS 2013 64bit in Windows 7

Kernel Launch Code

dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;

doParticles<<< grid, block >>>(d_pos, d_vel, d_acc, d_mass, numParticles, dt);

How can I change this code to accomplish my goal?

I can post some kernel code etc. but don't think it matters since the kernel doesn't even launch. Let me know if any other info would be useful.

Thanks in advance.

EDIT:

MCVE

main

#define TPB 32
....

unsigned int numParticles = 1000;
p_type* h_pos;
p_type* h_vel;
p_type* h_acc;
p_type* h_mass;

p_type* d_pos;
p_type* d_vel;
p_type* d_acc;
p_type* d_mass;


int pointsPerParticleVec = 3;
size_t size = sizeof(p_type) * 3 * numParticles;

h_pos = (p_type*)malloc(size);
h_vel = (p_type*)malloc(size);
h_acc = (p_type*)malloc(size);
h_mass = (p_type*)malloc(size / 3);

d_pos = NULL;
d_vel = NULL;
d_acc = NULL;

cudaError_t err = cudaSuccess;
//allocate space on GPU
err = cudaMalloc((void **)&d_pos, size);
err = cudaMalloc((void **)&d_vel, size);
err = cudaMalloc((void **)&d_acc, size);
err = cudaMalloc((void **)&d_mass, size / 3);

//nothing really matters for this example just making sure no gargage values happen
for (int partIt = 0; partIt < numParticles; partIt++)
{
    int index = partIt * 3;

    h_pos[index] = 0;
    h_pos[index + 1] = 0;
    h_pos[index + 2] = 0;

    h_vel[index] = 0;
    h_vel[index + 1] = 0;
    h_vel[index + 2] = 0;

    h_acc[index] = 0;
    h_acc[index + 1] = 0;
    h_acc[index + 2] = 0;

    h_mass[partIt] = 0;
}

err = cudaMemcpy(d_pos, h_pos, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_vel, h_vel, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_acc, h_acc, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_mass, h_mass, size / 3, cudaMemcpyHostToDevice);

while (true)    //display functionality removed for now
{
    //do calculations
    float dt = .1;
    dim3 block(TPB, TPB);
    dim3 grid;
    grid.x = (numParticles + TPB - 1) / TPB;
    grid.y = (numParticles + TPB - 1) / TPB;

    doParticles << < grid, block >> >(d_pos, d_vel, d_acc, d_mass, numParticles, dt);   //<<<<<<<<<<<<here is where it does not launch

    err = cudaGetLastError();

    if (err != cudaSuccess)
    {
        fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));   //see the error pop up here
        exit(EXIT_FAILURE);
    }

    cudaDeviceSynchronize();

    int numBlocks2 = (numParticles * 3 + TPB - 1) / TPB;

    //add acceleration to velocity
    ARR_ADD << <numBlocks2, TPB >> >(d_vel, d_acc, numParticles * 3);

    cudaDeviceSynchronize();
    //reset acceleration vector 
    ARR_SET << <numBlocks2, TPB >> >(d_acc, 0.0f, numParticles * 3);

    //add velocity to position
    POS_ADD << <numBlocks2, TPB >> >(d_pos, d_vel, numParticles * 3, dt);

    //copy vector back to cpu (until opengl-cuda gets implemented)
    cudaMemcpy(h_pos, d_pos, sizeof(p_type) * 3 * numParticles, cudaMemcpyDeviceToHost);
}

kernels

__device__ float fInvSqrt_D(const float& in)
{
    long i;
    float x2, y;
    const float threehalfs = 1.5F;

    x2 = in * 0.5F;
    y = in;
    i = *(long *)&y;
    i = 0x5f3759df - (i >> 1);
    y = *(float *)&i;
    y = y * (threehalfs - (x2 * y * y));
    y = y * (threehalfs - (x2 * y * y));    //extra precision
    return abs(y);
}

__global__ void POS_ADD(p_type* getter, const p_type *giver, int N, float dt)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < N)
    {
        getter[index] = getter[index] + (giver[index]*dt);
    }

}

__global__ void ARR_ADD(p_type* getter, const p_type *giver, int N)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < N)
    {
        getter[index] = getter[index] + giver[index];
    }

}

__global__ void ARR_SET(p_type* getter, const p_type value, int N)
{
    int index = blockDim.x * blockIdx.x + threadIdx.x;
    if (index < N)
    {
        getter[index] = value;
    }
}



__global__ void doParticles(p_type* pos, p_type* vel, p_type* acc, p_type* mass, int numParticles, float tstep)
{
    int blockId = blockIdx.y * gridDim.x + blockIdx.x;
    int threadIndex = blockId * blockDim.x + threadIdx.x;

    int cRowPos = blockId % gridDim.y;
    int notInRow = blockId - cRowPos;

    int Y = blockId/gridDim.x + threadIdx.y;    //slower changing iterator
    int X = threadIndex - ((notInRow) * TPB);   //fast changing iterator

    int pIndex1 = X * 3;
    int pIndex2 =Y * 3;

    if (pIndex1 != pIndex2 && Y < numParticles)
    {

        p_type diffx = (pos[pIndex1] - pos[pIndex2]);           //calculating difference between points
        p_type diffy = (pos[pIndex1 + 1] - pos[pIndex2 + 1]);
        p_type diffz = (pos[pIndex1 + 2] - pos[pIndex2 + 2]);

        p_type distsqr = diffx*diffx + diffy*diffy + diffz*diffz;

        if (distsqr < 0)
        {
            distsqr *= -1;
        }
        if (distsqr < 500)
        {
            distsqr = 500;
        }

        p_type attraction = (mass[X] * mass[Y]) / (distsqr);    //gravity equation


        p_type invsqrt = fInvSqrt_D((float)distsqr);
        p_type normx = invsqrt*diffx;
        p_type normy = invsqrt*diffy;
        p_type normz = invsqrt*diffz;

        p_type forcex = normx * -attraction;
        p_type forcey = normy * -attraction;
        p_type forcez = normz * -attraction;

        acc[pIndex1] += (forcex * tstep) / mass[X];
        acc[pIndex1 + 1] += (forcey * tstep) / mass[X];
        acc[pIndex1 + 2] += (forcez * tstep) / mass[X];


    }
}

And yes, I know the indexing in the doParticle kernel is broken. I plan on fixing that one it launches. :)

Thanks again.

Solution

CUDA thread blocks are limited to a maximum of 1024 threads, The total threads in the block is the product of the threadblock dimensions:

dim3 block(TPB, TPB);

So any value of TPB larger than 32 here is not going to work, and you will get an invalid configuration argument error when you try to launch any such kernel.

So reduce T or TPB to 32 and you should be able to launch your kernel.