I'm having trouble tracking down an issue in my CUDA code.
I'm essentially computing the n-body problem from GPU Gems 3, but with an array whose size grows over time.
Particles are created by the __global__ void ParticleAmplification() kernel into a dedicated array, dev_Ionisation, and are then merged into the global array by a host function, DynamicAlloc().
In that function, empty slots are removed and the new particles are appended at the end of the new array. Since I launch more threads than there are particles, I use an escape variable to avoid wasting time checking whether each position actually holds a particle.
The number of blocks and tiles is recomputed dynamically, and the device arrays are reallocated with:
checkCudaErrors( cudaFree( dev_vector ) );
checkCudaErrors( cudaMalloc( (void**)&dev_vector, N * sizeof(ParticleProperties) ) );
Then, after some steps, usually once the number of particles has grown to around 28,000, the kernel crashes. It reports error code 77 (cudaDeviceSynchronize() error code 77: cudaErrorIllegalAddress), which I assumed was caused by a wrong size of the extern __shared__ float3 sharedPos[] variable in the __device__ float3 computeBodyAccel function. However, the shared memory size always seems to be passed correctly to the kernel, and it never changes:
size_t sharedMemSize = ThreadsInit * sizeof(float3);
integrateBodies<<<blocksInit, ThreadsInit, sharedMemSize>>>( dev_vectorIonisation, dt, numTiles, nbodyTemp );
When I use a fixed (but large) array instead, everything runs fine.
What am I doing wrong? Could shared memory be filling up with memory that was somehow never freed?
Here is the complete, compilable code:
// ----- Libraries C/C++ ----- //
#include <stdio.h>
#include <stdlib.h>
#include <iostream>
#include <iomanip>
#include <string>
#include <math.h>
#include <time.h>
#include <dirent.h>
// ----- Libraries CUDA ----- //
#include "cuda.h"
#include <helper_cuda.h>
#include "curand_kernel.h"
// ----- Global variables ----- //
#define El_DIM 512
#define imin(a,b) (a<b?a:b)
using namespace std;
__constant__ float softening_ = 1.0e-12; // softening factor for nbody interaction
__device__ __managed__ int NewParticles = 0;
__device__ __managed__ int TotalProcesses = 0;
__device__ __managed__ bool Ampl = false;
const int ThreadsInit = 512;
const int blocksPerGrid = (int)( ( El_DIM * El_DIM + ThreadsInit -1 ) / ThreadsInit );
struct ParticleProperties{
float3 Position, Velocity, Force;
};
__device__ void initVector( ParticleProperties *dev_Vect, int index ){
dev_Vect[index].Position.x = -1.0;
dev_Vect[index].Position.y = -1.0;
dev_Vect[index].Position.z = -1.0;
dev_Vect[index].Velocity.x = 0.0;
dev_Vect[index].Velocity.y = 0.0;
dev_Vect[index].Velocity.z = 0.0;
dev_Vect[index].Force.x = 0.0;
dev_Vect[index].Force.y = 0.0;
dev_Vect[index].Force.z = 0.0;
}
__device__ void SetVector( ParticleProperties *dev_Vect, float3 position, float4 v, int index ){
dev_Vect[index].Position.x = position.x;
dev_Vect[index].Position.y = position.y;
dev_Vect[index].Position.z = position.z;
dev_Vect[index].Velocity.x = v.x;
dev_Vect[index].Velocity.y = v.y;
dev_Vect[index].Velocity.z = v.z;
dev_Vect[index].Force.x = 0.0;
dev_Vect[index].Force.y = 0.0;
dev_Vect[index].Force.z = 0.0;
}
__device__ float3 bodyBodyInteraction( float3 fi, float3 bi, float3 bj ){
float3 r;
// r_ij [4 FLOPS]
r.x = ( bj.x - bi.x );
r.y = ( bj.y - bi.y );
r.z = ( bj.z - bi.z );
r.z = 0.0;
// distSqr = dot(r_ij, r_ij) + EPS^2 [7 FLOPS]
float distSqr = r.x * r.x + ( r.y * r.y + ( r.z * r.z + softening_ * softening_ ) );
// invDistCube =1/distSqr^(3/2) [4 FLOPS (2 mul, 1 sqrt, 1 inv)]
float invDist = rsqrt(distSqr);
float invDistCube = invDist * invDist * invDist;
// s = m_j * invDistCube [2 FLOP]
float s = invDistCube;
// a_i = a_i + s * r_ij [6 FLOPS]
fi.x += r.x * s;
fi.y += r.y * s;
fi.z += r.z * s;
return fi;
}
__device__ float3 computeBodyAccel( float3 force, float3 bodyPos, ParticleProperties * positions, const int numTiles, const int nbody ){
extern __shared__ float3 sharedPos[];
int computedNbody = 0;
for( int tile = 0; tile < numTiles; tile++ ){
sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x].Position;
__syncthreads();
// This is the "tile_calculation" from the GPUG3 article.
#pragma unroll 128
for( unsigned int counter = 0; counter < blockDim.x; counter++ ){
force = bodyBodyInteraction(force, bodyPos, sharedPos[counter]);
computedNbody++;
if( computedNbody == nbody ) break;
}
__syncthreads();
}
return force;
}
__global__ void integrateBodies( ParticleProperties * __restrict__ dev_vector, float deltaTime, int numTiles, int nbody ){
int index = blockIdx.x * blockDim.x + threadIdx.x;
float3 position = {0.0, 0.0, 0.0};
float3 force = {0.0, 0.0, 0.0};
if( index < nbody ){
position = dev_vector[index].Position;
force = computeBodyAccel( force, position, dev_vector, numTiles, nbody );
// store new force
dev_vector[index].Position = position;
dev_vector[index].Force = force;
}
}
__global__ void IntegrationKernel( ParticleProperties * __restrict__ dev_vector, const float deltaT, const int nbody ){
int tid = blockIdx.x * blockDim.x + threadIdx.x;
float3 dvel;
float3 velocity;
if( tid < nbody ){
// integrate
dvel.x = dev_vector[tid].Force.x * deltaT * 0.5;
dvel.y = dev_vector[tid].Force.y * deltaT * 0.5;
dvel.z = dev_vector[tid].Force.z * deltaT * 0.5;
velocity.x = dev_vector[tid].Velocity.x + dvel.x;
velocity.y = dev_vector[tid].Velocity.y + dvel.y;
velocity.z = dev_vector[tid].Velocity.z + dvel.z;
dev_vector[tid].Position.x += velocity.x * deltaT;
dev_vector[tid].Position.y += velocity.y * deltaT;
dev_vector[tid].Position.z += velocity.z * deltaT;
dev_vector[tid].Velocity.x = velocity.x + dvel.x;
dev_vector[tid].Velocity.y = velocity.y + dvel.y;
dev_vector[tid].Velocity.z = velocity.z + dvel.z;
}
}
__global__ void ParticleAmplification( curandState *state, ParticleProperties * __restrict__ dev_vectorIonisation,
ParticleProperties * __restrict__ dev_Ionisation,
const float dt, int numbodies ){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int LocalProcesses = 0;
float3 position = {0.0, 0.0, 0.0};
float4 v_new = {0.0, 0.0, 0.0, 0.0};
float prob = 0.0;
if( TotalProcesses >= El_DIM * El_DIM - 1 ) Ampl = false;
if( tid < numbodies ){
position.x = dev_vectorIonisation[tid].Position.x;
position.y = dev_vectorIonisation[tid].Position.y;
position.z = dev_vectorIonisation[tid].Position.z;
prob = curand_uniform( &state[tid] );
if( Ampl ){
if( prob < 1.e-3 ){
atomicAdd( &TotalProcesses, 1 );
LocalProcesses = atomicAdd( &NewParticles, 1 );
v_new.x = 0.0;
v_new.y = 0.0;
v_new.z = 0.0;
SetVector( dev_Ionisation, position, v_new, LocalProcesses );
}
}
}
}
__global__ void initCurand( curandState *state, unsigned long seed ){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
curand_init(seed, tid, 0, &state[tid]);
}
__global__ void initProcessIoni( ParticleProperties *dev_Vect ){
int x = threadIdx.x + blockIdx.x * blockDim.x;
initVector( dev_Vect, x );
}
__global__ void Enumerate_Nbody( ParticleProperties *dev_Vect, int *N, int PrevNbody ){
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int gid = blockIdx.x;
extern __shared__ int cache[];
if( tid == 0 )
*N = 0;
if( threadIdx.x == 0 )
cache[gid] = 0;
__syncthreads();
while( tid < PrevNbody ){
if( dev_Vect[tid].Position.x > -1.0 )
atomicAdd( &(cache[gid]), 1 );
tid += blockDim.x * gridDim.x;
}
__syncthreads();
if( threadIdx.x == 0 )
atomicAdd( N, cache[gid] );
}
void DynamicAlloc( ParticleProperties **DynamicVector, const ParticleProperties *StaticVector, const int n, int nbody, const int max ){
ParticleProperties *h_vectorIonisation = new ParticleProperties [nbody];
ParticleProperties *VectTemporary = new ParticleProperties [n];
checkCudaErrors( cudaMemcpy( VectTemporary, *DynamicVector, n * sizeof(ParticleProperties), cudaMemcpyDeviceToHost ) );
checkCudaErrors( cudaFree( *DynamicVector ) );
int i = 0;
int j = 0;
for( i = 0; i < n; i++ ){
if( VectTemporary[i].Position.x > -1.0 ){
h_vectorIonisation[j] = VectTemporary[i];
j++;
}
}
delete [] VectTemporary;
if( NewParticles != 0 ){
ParticleProperties *StaticVectTemporary = new ParticleProperties [max];
checkCudaErrors( cudaMemcpy( StaticVectTemporary, StaticVector, max * sizeof(ParticleProperties), cudaMemcpyDeviceToHost ) );
int k = 0;
#pragma unroll 32
for( i = 0; i < max; i++ ){
if( StaticVectTemporary[i].Position.x > -1.0 ){
h_vectorIonisation[j + k] = StaticVectTemporary[i];
k++;
}
}
delete [] StaticVectTemporary;
}
if( nbody > 0 ){
checkCudaErrors( cudaMalloc( (void**)DynamicVector, nbody * sizeof(ParticleProperties) ) );
checkCudaErrors( cudaMemcpy( *DynamicVector, h_vectorIonisation, nbody * sizeof(ParticleProperties), cudaMemcpyHostToDevice ) );
}
delete [] h_vectorIonisation;
}
int main( int argc_, char **argv_ ){
cudaDeviceReset();
cudaDeviceProp prop;
checkCudaErrors( cudaGetDeviceProperties( &prop, 0 ) );
int Newers = 256;
int nbody = 1;
Ampl = true;
int *dev_nbody;
checkCudaErrors( cudaMalloc( (void**)&dev_nbody, sizeof(int) ) );
checkCudaErrors( cudaMemcpy( dev_nbody, &nbody, sizeof(int), cudaMemcpyHostToDevice ) );
float dt = 0.5e-13;
float3 pos;
pos.x = 1.0 / 2.0 * 1.0e-3;
pos.y = 1.0 / 2.0 * 1.0e-3;
pos.z = 1.0 / 2.0 * 1.0e-3;
float3 speed;
speed.x = 0.0;
speed.y = 0.0;
speed.z = 0.0;
ParticleProperties *dev_vectorIonisation;
checkCudaErrors( cudaMalloc( (void**)&dev_vectorIonisation, nbody * sizeof(ParticleProperties) ) );
ParticleProperties *host_vectorIonisation = new ParticleProperties [nbody];
clog << "Particles array initialisation...";
for( int i = 0; i < nbody; i++ ){
host_vectorIonisation[i].Position.x = drand48() * 1.0e-6 + pos.x;
host_vectorIonisation[i].Position.y = drand48() * 1.0e-6 + pos.y;
host_vectorIonisation[i].Position.z = 0.0;
host_vectorIonisation[i].Velocity.x = speed.x;
host_vectorIonisation[i].Velocity.y = speed.y;
host_vectorIonisation[i].Velocity.z = speed.z;
host_vectorIonisation[i].Force.x = 0.0;
host_vectorIonisation[i].Force.y = 0.0;
host_vectorIonisation[i].Force.z = 0.0;
}
checkCudaErrors( cudaMemcpy( dev_vectorIonisation, host_vectorIonisation, nbody * sizeof(ParticleProperties), cudaMemcpyHostToDevice ) );
delete [] host_vectorIonisation;
clog << "Done" << endl;
ParticleProperties *dev_Ionisation;
checkCudaErrors( cudaMalloc( (void**)&dev_Ionisation, Newers * sizeof(ParticleProperties) ) );
curandState *RndState;
checkCudaErrors( cudaMalloc( (void**)&RndState, El_DIM * El_DIM * sizeof(curandState) ) );
unsigned long seed = 1773;
clog << "cuRand array initialisation...";
initCurand<<<blocksPerGrid, ThreadsInit>>>( RndState, seed );
initProcessIoni<<<1, Newers>>>( dev_Ionisation );
clog << "Done" << endl;
clog << "Propagation of " << nbody << " primary particle(s)." << endl;
int ProcessTemp = 0;
int nbodyTemp = nbody;
int blocksInit = (nbody + ThreadsInit - 1) / ThreadsInit;
int numTiles = (nbody + ThreadsInit - 1) / ThreadsInit;
size_t sharedMemSize = ThreadsInit * sizeof(float3);
char buffer[64];
setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
while( nbody > 0 ){
integrateBodies<<<blocksInit, ThreadsInit, sharedMemSize>>>( dev_vectorIonisation, dt, numTiles, nbodyTemp );
IntegrationKernel<<<blocksInit, ThreadsInit>>>( dev_vectorIonisation, dt, nbodyTemp );
ParticleAmplification<<<blocksInit, ThreadsInit>>>( RndState, dev_vectorIonisation, dev_Ionisation, dt, nbodyTemp );
checkCudaErrors( cudaDeviceSynchronize() );
Enumerate_Nbody<<<blocksInit, ThreadsInit, blocksInit * sizeof(int)>>>( dev_vectorIonisation, dev_nbody, nbodyTemp );
checkCudaErrors( cudaDeviceSynchronize() );
getLastCudaError("Kernel enumerate bodies execution failed");
checkCudaErrors( cudaMemcpy( &nbody, dev_nbody, sizeof(int), cudaMemcpyDeviceToHost ) );
nbody += NewParticles;
if( NewParticles > ProcessTemp ) ProcessTemp = NewParticles;
if( nbody != nbodyTemp ){
DynamicAlloc( &dev_vectorIonisation, dev_Ionisation, nbodyTemp, nbody, Newers );
numTiles = blocksInit = ( nbody + ThreadsInit - 1) / ThreadsInit;
if( NewParticles != 0 ){
initProcessIoni<<<1, Newers>>>( dev_Ionisation );
checkCudaErrors( cudaDeviceSynchronize() );
}
nbodyTemp = nbody;
NewParticles = 0;
checkCudaErrors( cudaDeviceSynchronize() );
}
printf("\r nbodies: %d", nbodyTemp);
}
checkCudaErrors( cudaFree( dev_Ionisation ) );
}
This is executed on a GTX Titan Black with compute capability 3.5.
Your problem begins with this line of code (in main):
numTiles = blocksInit = ( nbody + ThreadsInit - 1) / ThreadsInit;
This creates enough tiles to fully cover nbody bodies, but the last tile is generally not fully populated. For example, with ThreadsInit = 512 and nbody = 600, you get numTiles = 2, so the second tile holds only 88 valid bodies while all 512 threads of a block still read from it.
The problem then actually manifests itself at this point in your computeBodyAccel routine, called from integrateBodies:
for( int tile = 0; tile < numTiles; tile++ ){
sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x].Position;
You have no protection on the indexing into positions, and you are assuming that every tile has a valid positions entry for every value of threadIdx.x. That is not the case, and the first manifestation of trouble can be spotted by compiling your code with -lineinfo and running it under cuda-memcheck. With the tight memory checking provided by cuda-memcheck, your code (for me) fails at around 500 bodies, not 28,000, and the specific failure is an invalid global read of size 4 at the last line of code shown above. (So it is not an indexing problem related to the write to shared memory.) Fundamentally, the problem is that tile * blockDim.x + threadIdx.x can exceed nbody, so you index out of bounds on the read of positions. (Using -lineinfo to identify the specific kernel line of code that is failing is covered here.)
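For reference, a compile-and-run sequence along these lines is what I mean (the file name nbody.cu and the samples include path are placeholders; adjust them to your setup):
nvcc -arch=sm_35 -lineinfo -I/path/to/cuda-samples/common/inc -o nbody nbody.cu
cuda-memcheck ./nbody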
The following limit-checking modifications to your computeBodyAccel routine allow me to run your code up to about 262,000 bodies, where the count stops increasing (due to the Ampl limit at El_DIM*El_DIM) and just stays there:
__device__ float3 computeBodyAccel( float3 force, float3 bodyPos, ParticleProperties * positions, const int numTiles, const int nbody ){
extern __shared__ float3 sharedPos[];
int computedNbody = 0;
for( int tile = 0; tile < numTiles; tile++ ){
if ((tile*blockDim.x + threadIdx.x) < nbody)
sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x].Position;
__syncthreads();
// This is the "tile_calculation" from the GPUG3 article.
int limit = blockDim.x;
if (tile == (numTiles - 1)) limit -= (numTiles*blockDim.x)-nbody;
#pragma unroll 128
for( unsigned int counter = 0; counter < limit; counter++ ){
force = bodyBodyInteraction(force, bodyPos, sharedPos[counter]);
computedNbody++;
if( computedNbody == nbody ) break;
}
__syncthreads();
}
return force;
}
There also seems to be an additional problem in your code, even though with the above fix it appears to run. If you run the code (with the above fix) under cuda-memcheck, using the -lineinfo method described here, you will discover (due to the tighter memory range checking that cuda-memcheck enforces) that once the number of bodies becomes large you eventually hit another memory access error, in ParticleAmplification, when it tries to create a new particle and calls SetVector at the end. It seems you have a race condition between this line:
if( TotalProcesses >= El_DIM * El_DIM - 1 ) Ampl = false;
and the following lines, which increment both TotalProcesses and NewParticles (the old value of NewParticles becoming LocalProcesses):
atomicAdd( &TotalProcesses, 1 );
LocalProcesses = atomicAdd( &NewParticles, 1 );
Since you have many threads running in parallel, this type of limit check is ineffective. You'll need to manage the creation of new particles more carefully, by checking the values actually returned by the atomicAdd operations and seeing whether they exceed the limits.
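One possible direction is sketched below (a sketch only, not a drop-in fix; maxNew is an assumed extra kernel parameter holding the capacity of dev_Ionisation, i.e. your Newers value): reserve a slot first, and only write the particle if the reserved index is still inside the buffer.
// Sketch: reserve an index with atomicAdd, then bounds-check the returned value.
// maxNew is assumed to be passed to the kernel and equal to the size of dev_Ionisation.
if( Ampl && prob < 1.e-3 ){
    int slot = atomicAdd( &NewParticles, 1 );   // returned value = index this thread owns
    if( slot < maxNew ){
        atomicAdd( &TotalProcesses, 1 );        // count only particles actually created
        SetVector( dev_Ionisation, position, v_new, slot );
    }
    // If slot >= maxNew the reservation overshot: NewParticles now exceeds the buffer
    // capacity, so the host must clamp it (e.g. min(NewParticles, maxNew)) before
    // adding it to nbody and before merging from dev_Ionisation.
}
The host side would then clamp NewParticles accordingly before the merge in DynamicAlloc, and the Ampl shutoff at El_DIM * El_DIM can be re-evaluated between time steps from the (now accurate) TotalProcesses value.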