I am building a n-body simulation, using CUDA to increase performance. I am currently working on further parallelizing the system so each interaction between particles runs on a seperate thread. This reduces the theoretical complexity to 1 only limited by the speed of the GPU.
In order to accomplish this I am attempting to launch a kernel using an N^2 set of blocks (where N in number of particles) using (N/T, N/T)
grid and T*T
blocks (where T is the number of threads per block). I have been able to launch an N*N grid but whenever I try to use multidimensional blocks (of threads) the kernel crashes with:
error code invalid configuration arguments
This is with T=512 and N=5000 but reducing these to T=128 and N=1000 had no effect. Here are some specs and code:
Cuda SDK Version: 7.5
GPU: GTX 970 4gb
CC Version: 5.2
Compiling in MSVS 2013 64bit in Windows 7
dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;
doParticles<<< grid, block >>>(d_pos, d_vel, d_acc, d_mass, numParticles, dt);
How can I change this code to accomplish my goal?
I can post some kernel code etc. but don't think it matters since the kernel doesn't even launch. Let me know if any other info would be useful.
Thanks in advance.
EDIT:
main
#define TPB 32
....
unsigned int numParticles = 1000;
p_type* h_pos;
p_type* h_vel;
p_type* h_acc;
p_type* h_mass;
p_type* d_pos;
p_type* d_vel;
p_type* d_acc;
p_type* d_mass;
int pointsPerParticleVec = 3;
size_t size = sizeof(p_type) * 3 * numParticles;
h_pos = (p_type*)malloc(size);
h_vel = (p_type*)malloc(size);
h_acc = (p_type*)malloc(size);
h_mass = (p_type*)malloc(size / 3);
d_pos = NULL;
d_vel = NULL;
d_acc = NULL;
cudaError_t err = cudaSuccess;
//allocate space on GPU
err = cudaMalloc((void **)&d_pos, size);
err = cudaMalloc((void **)&d_vel, size);
err = cudaMalloc((void **)&d_acc, size);
err = cudaMalloc((void **)&d_mass, size / 3);
//nothing really matters for this example just making sure no gargage values happen
for (int partIt = 0; partIt < numParticles; partIt++)
{
int index = partIt * 3;
h_pos[index] = 0;
h_pos[index + 1] = 0;
h_pos[index + 2] = 0;
h_vel[index] = 0;
h_vel[index + 1] = 0;
h_vel[index + 2] = 0;
h_acc[index] = 0;
h_acc[index + 1] = 0;
h_acc[index + 2] = 0;
h_mass[partIt] = 0;
}
err = cudaMemcpy(d_pos, h_pos, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_vel, h_vel, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_acc, h_acc, size, cudaMemcpyHostToDevice);
err = cudaMemcpy(d_mass, h_mass, size / 3, cudaMemcpyHostToDevice);
while (true) //display functionality removed for now
{
//do calculations
float dt = .1;
dim3 block(TPB, TPB);
dim3 grid;
grid.x = (numParticles + TPB - 1) / TPB;
grid.y = (numParticles + TPB - 1) / TPB;
doParticles << < grid, block >> >(d_pos, d_vel, d_acc, d_mass, numParticles, dt); //<<<<<<<<<<<<here is where it does not launch
err = cudaGetLastError();
if (err != cudaSuccess)
{
fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err)); //see the error pop up here
exit(EXIT_FAILURE);
}
cudaDeviceSynchronize();
int numBlocks2 = (numParticles * 3 + TPB - 1) / TPB;
//add acceleration to velocity
ARR_ADD << <numBlocks2, TPB >> >(d_vel, d_acc, numParticles * 3);
cudaDeviceSynchronize();
//reset acceleration vector
ARR_SET << <numBlocks2, TPB >> >(d_acc, 0.0f, numParticles * 3);
//add velocity to position
POS_ADD << <numBlocks2, TPB >> >(d_pos, d_vel, numParticles * 3, dt);
//copy vector back to cpu (until opengl-cuda gets implemented)
cudaMemcpy(h_pos, d_pos, sizeof(p_type) * 3 * numParticles, cudaMemcpyDeviceToHost);
}
kernels
__device__ float fInvSqrt_D(const float& in)
{
long i;
float x2, y;
const float threehalfs = 1.5F;
x2 = in * 0.5F;
y = in;
i = *(long *)&y;
i = 0x5f3759df - (i >> 1);
y = *(float *)&i;
y = y * (threehalfs - (x2 * y * y));
y = y * (threehalfs - (x2 * y * y)); //extra precision
return abs(y);
}
__global__ void POS_ADD(p_type* getter, const p_type *giver, int N, float dt)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = getter[index] + (giver[index]*dt);
}
}
__global__ void ARR_ADD(p_type* getter, const p_type *giver, int N)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = getter[index] + giver[index];
}
}
__global__ void ARR_SET(p_type* getter, const p_type value, int N)
{
int index = blockDim.x * blockIdx.x + threadIdx.x;
if (index < N)
{
getter[index] = value;
}
}
__global__ void doParticles(p_type* pos, p_type* vel, p_type* acc, p_type* mass, int numParticles, float tstep)
{
int blockId = blockIdx.y * gridDim.x + blockIdx.x;
int threadIndex = blockId * blockDim.x + threadIdx.x;
int cRowPos = blockId % gridDim.y;
int notInRow = blockId - cRowPos;
int Y = blockId/gridDim.x + threadIdx.y; //slower changing iterator
int X = threadIndex - ((notInRow) * TPB); //fast changing iterator
int pIndex1 = X * 3;
int pIndex2 =Y * 3;
if (pIndex1 != pIndex2 && Y < numParticles)
{
p_type diffx = (pos[pIndex1] - pos[pIndex2]); //calculating difference between points
p_type diffy = (pos[pIndex1 + 1] - pos[pIndex2 + 1]);
p_type diffz = (pos[pIndex1 + 2] - pos[pIndex2 + 2]);
p_type distsqr = diffx*diffx + diffy*diffy + diffz*diffz;
if (distsqr < 0)
{
distsqr *= -1;
}
if (distsqr < 500)
{
distsqr = 500;
}
p_type attraction = (mass[X] * mass[Y]) / (distsqr); //gravity equation
p_type invsqrt = fInvSqrt_D((float)distsqr);
p_type normx = invsqrt*diffx;
p_type normy = invsqrt*diffy;
p_type normz = invsqrt*diffz;
p_type forcex = normx * -attraction;
p_type forcey = normy * -attraction;
p_type forcez = normz * -attraction;
acc[pIndex1] += (forcex * tstep) / mass[X];
acc[pIndex1 + 1] += (forcey * tstep) / mass[X];
acc[pIndex1 + 2] += (forcez * tstep) / mass[X];
}
}
And yes, I know the indexing in the doParticle kernel is broken. I plan on fixing that one it launches. :)
Thanks again.
CUDA thread blocks are limited to a maximum of 1024 threads, The total threads in the block is the product of the threadblock dimensions:
dim3 block(TPB, TPB);
So any value of TPB
larger than 32 here is not going to work, and you will get an invalid configuration argument error when you try to launch any such kernel.
So reduce T
or TPB
to 32 and you should be able to launch your kernel.