Tags: c++, cuda, thrust, libc++, gpu-shared-memory

Shared Memory of Objects in CUDA and libc++abi.dylib error


I have the following problem (keep in mind that I am fairly new to programming with CUDA).

I have a class called vec3f that is just like the float3 data type, but with overloaded operators and other vector functions. These functions are marked __device__ __host__. Then, in my kernel, I do a nested for loop over the block_x and block_y indices and do something like this:

//set up shared memory block
extern __shared__ vec3f share[];
vec3f *sh_pos = share;
vec3f *sh_velocity = &sh_pos[blockDim.x*blockDim.y];
sh_pos[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].position();
sh_velocity[blockDim.x * threadIdx.x + threadIdx.y] = oldParticles[index].velocity();
__syncthreads();

In the above code, oldParticles is a pointer to a class called particle that is being passed to the kernel. oldParticles is actually the underlying pointer of a thrust::device_vector<particle> (I'm not sure whether that is relevant). Everything compiles fine, but when I run it I get the error:

libc++abi.dylib: terminate called throwing an exception
Abort trap: 6
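
That message indicates an uncaught C++ exception: Thrust reports CUDA errors by throwing thrust::system_error, so wrapping the Thrust calls in a try/catch is one way to surface the underlying error string. A minimal sketch (the vector names match the snippets below):

#include <iostream>
#include <thrust/system_error.h>

//catch the exception Thrust throws on a CUDA error, so the error string
//is printed instead of terminate() aborting the program
try {
  d_old_particles = h_particles; //any Thrust call that touches the device
} catch (thrust::system_error &e) {
  std::cerr << "Thrust/CUDA error: " << e.what() << std::endl;
}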

Thanks for the replies. I think the error had to do with my not allocating room for the arguments being passed to my kernel. Doing the following in my host code fixed the error:

particle* particle_ptrs[2];
particle_ptrs[0] = thrust::raw_pointer_cast(&d_old_particles[0]);
particle_ptrs[1] = thrust::raw_pointer_cast(&d_new_particles[0]);
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) ) );
CUDA_SAFE_CALL( cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) ) );

The kernel call is then:

force_kernel<<< grid,block,sharedMemSize  >>>(particle_ptrs[0],particle_ptrs[1],time_step);
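
Here sharedMemSize needs to cover the two dynamically sized vec3f arrays declared with extern __shared__ in the kernel; a minimal sketch of that computation, assuming the block dimensions used at launch:

//two vec3f arrays (positions and velocities) of blockDim.x*blockDim.y
//elements each live in the dynamic shared memory block
size_t sharedMemSize = 2 * block.x * block.y * sizeof(vec3f);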

The issue that I am having now seems to be that I can't get data copied back to the host from the device. I think this has to do with my not being familiar with Thrust.

I'm doing a series of copies as follows:

//make a host vector assume this is initialized
thrust::host_vector<particle> h_particles;
thrust::device_vector<particle> d_old_particles, d_new_particles;
d_old_particles = h_particles;
//launch kernel as shown above 
//with thrust vectors having been casted into their underlying pointers
//particle_ptrs[1] gets modified, so shouldn't d_new_particles be too?
//copy back
h_particles = d_new_particles;

So I guess my question is: can I modify a Thrust device_vector in a kernel (in this case particle_ptrs[0]), save the modification to another Thrust device_vector in the kernel (in this case particle_ptrs[1]), and then, once I exit from the kernel, copy that to a host_vector?

I still can't get this to work. I made a shorter example that reproduces the same problem:

#include <iostream>
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include "vec3f.h"
const int BLOCK_SIZE = 8;
const int max_particles = 64;
const float dt = 0.01;

using namespace std;
//particle class
class particle {
public:
  particle() : 
    _velocity(vec3f(0,0,0)), _position(vec3f(0,0,0)), _density(0.0) {
  };
  particle(const vec3f& pos, const vec3f& vel) :
    _position(pos), _velocity(vel), _density(0.0) {
  };

  vec3f _velocity;
  vec3f _position;
  float _density;
};

//forward declaration of kernel func
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);

//global thrust vectors
thrust::host_vector<particle> h_parts;
thrust::device_vector<particle> old_parts, new_parts;
particle* particle_ptrs[2];

int main() {
  //load host vector
  for (int i =0; i<max_particles; i++) {
    h_parts.push_back(particle(vec3f(0.5,0.5,0.5),vec3f(10,10,10)));
  }

  particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
  particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
  cudaMalloc( (void**)&particle_ptrs[0], max_particles * sizeof(particle) );
  cudaMalloc( (void**)&particle_ptrs[1], max_particles * sizeof(particle) );
  //copy host particles to old device particles...
  old_parts = h_parts;
  //kernel block and grid dimensions
  dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
  dim3 grid(int(sqrt(float(max_particles) / (float(block.x*block.y)))), int(sqrt(float(max_particles) / (float(block.x*block.y)))), 1);
  kernel_func<<<block,grid>>>(particle_ptrs[0],particle_ptrs[1],dt);
  //copy new device particles back to host particles
  h_parts = new_parts;
  for (int i =0; i<max_particles; i++) {
    particle temp1 = h_parts[i];
    cout << temp1._position << endl;
  }  
  //delete thrust device vectors
  old_parts.clear();
  old_parts.shrink_to_fit();
  new_parts.clear();
  new_parts.shrink_to_fit();
  return 0;
}

//kernel function
__global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
  unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
  //get array position for 2d grid...
  unsigned int arr_pos = y*blockDim.x*gridDim.x + x;

  new_parts[arr_pos]._velocity = old_parts[arr_pos]._velocity * 10.0 * dt;
  new_parts[arr_pos]._position = old_parts[arr_pos]._position * 10.0 * dt;
  new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
}

So the host_vector has an initial position of (0.5,0.5,0.5) for all 64 particles. The kernel then attempts to multiply that by 10 to give (5,5,5) as the position of each particle. But I don't see this when I cout the data; it is still just (0.5,0.5,0.5). Is there a problem with how I am allocating memory? Is there a problem with these lines:

  //copy new device particles back to host particles
  h_parts = new_parts;

What could be the issue? Thank you.


Solution

  • There are various problems with the code you have posted.

    • you have your block and grid variables reversed in your kernel invocation. grid comes first.
    • you should be doing CUDA error checking on your kernel calls and runtime API calls; see the sketch after this list.
    • your method of allocating storage, calling cudaMalloc on a pointer that has been raw-cast from an empty device vector, is not sensible. The vector container has no knowledge that you did this "under the hood" (and taking the address of element 0 of an empty vector is invalid in the first place). Instead, you can allocate storage for the device vector directly when you instantiate it, like:

      thrust::device_vector<particle> old_parts(max_particles), new_parts(max_particles);
      
    • You say you're expecting 5,5,5, but your kernel multiplies by 10 and then by dt, which is 0.01, so I believe the correct output is 0.05, 0.05, 0.05 (0.5 * 10 * 0.01 = 0.05).
    • Your grid computation, int(sqrt(...)), was broken in a couple of ways. Casting the float result to int truncates (rounds down), so for an arbitrary max_particles it is not guaranteed to produce enough blocks; that round-down case is bad, and we handle it by using a ceil function (or another grid computation method). Rounding up, which is what ceil does, is OK, but it means the grid may launch extra blocks/threads, so we also add a thread check in the kernel. The formula itself was wrong as well: we want to take the square root of max_particles and then divide it by the block dimension in a particular direction to get the grid dimension in that direction, not divide max_particles by the product of the block dimensions before taking the square root. With max_particles = 64 and an 8x8 block, that gives sqrt(64)/8 = 1 block in each grid direction.
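
    A common way to do the error checking mentioned above is a small macro wrapped around cudaGetLastError, used after each kernel launch and runtime API call. A sketch (the macro name is illustrative):

    #include <cstdio>
    #include <cstdlib>

    //print the CUDA error string and abort if any prior call failed
    #define cudaCheckErrors(msg) \
        do { \
            cudaError_t __err = cudaGetLastError(); \
            if (__err != cudaSuccess) { \
                fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                        msg, cudaGetErrorString(__err), __FILE__, __LINE__); \
                exit(1); \
            } \
        } while (0)

    //usage:
    //  kernel_func<<<grid,block>>>(particle_ptrs[0],particle_ptrs[1],dt);
    //  cudaCheckErrors("kernel launch failed");
    //  cudaDeviceSynchronize();
    //  cudaCheckErrors("kernel execution failed");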

    Here's some code that I've modified with these changes in mind; it seems to produce the correct output (0.05, 0.05, 0.05). Note that I had to make some other changes because I don't have your "vec3f.h" header file handy, so I used float3 instead.

    #include <iostream>
    #include <cmath>
    #include <thrust/host_vector.h>
    #include <thrust/device_vector.h>
    #include <vector_functions.h>
    
    const int BLOCK_SIZE = 8;
    const int max_particles = 64;
    const float dt = 0.01;
    
    using namespace std;
    //particle class
    class particle {
    public:
      particle() :
        _velocity(make_float3(0,0,0)), _position(make_float3(0,0,0)), _density(0.0) {
      };
      particle(const float3& pos, const float3& vel) :
        _position(pos), _velocity(vel), _density(0.0) {
      };
    
      float3 _velocity;
      float3 _position;
      float _density;
    };
    
    //forward declaration of kernel func
    __global__ void kernel_func(particle* old_parts, particle* new_parts, float dt);
    
    
    int main() {
      //global thrust vectors
      thrust::host_vector<particle> h_parts;
      particle* particle_ptrs[2];
      //load host vector
      for (int i =0; i<max_particles; i++) {
        h_parts.push_back(particle(make_float3(0.5,0.5,0.5),make_float3(10,10,10)));
      }
    
      //copy host particles to old device particles...
      thrust::device_vector<particle> old_parts = h_parts;
      thrust::device_vector<particle> new_parts(max_particles);
      particle_ptrs[0] = thrust::raw_pointer_cast(&old_parts[0]);
      particle_ptrs[1] = thrust::raw_pointer_cast(&new_parts[0]);
      //kernel block and grid dimensions
      dim3 block(BLOCK_SIZE,BLOCK_SIZE,1);
      dim3 grid((int)ceil(sqrt(float(max_particles)) / (float(block.x))), (int)ceil(sqrt(float(max_particles)) / (float(block.y))), 1);
      cout << "grid x: " << grid.x << "  grid y: "  << grid.y << endl;
      kernel_func<<<grid,block>>>(particle_ptrs[0],particle_ptrs[1],dt);
      cudaDeviceSynchronize();
      //copy new device particles back to host particles
      h_parts = new_parts;
      for (int i =0; i<max_particles; i++) {
        particle temp1 = h_parts[i];
        cout << temp1._position.x << "," << temp1._position.y << "," << temp1._position.z << endl;
      }
      //delete thrust device vectors
      old_parts.clear();
      old_parts.shrink_to_fit();
      new_parts.clear();
      new_parts.shrink_to_fit();
    
      return 0;
    }
    
    //kernel function
    __global__ void kernel_func(particle* old_parts, particle* new_parts, float dt) {
      unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
      unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
      //get array position for 2d grid...
      unsigned int arr_pos = y*blockDim.x*gridDim.x + x;
      if (arr_pos < max_particles) {
        new_parts[arr_pos]._velocity.x = old_parts[arr_pos]._velocity.x * 10.0 * dt;
        new_parts[arr_pos]._velocity.y = old_parts[arr_pos]._velocity.y * 10.0 * dt;
        new_parts[arr_pos]._velocity.z = old_parts[arr_pos]._velocity.z * 10.0 * dt;
        new_parts[arr_pos]._position.x = old_parts[arr_pos]._position.x * 10.0 * dt;
        new_parts[arr_pos]._position.y = old_parts[arr_pos]._position.y * 10.0 * dt;
        new_parts[arr_pos]._position.z = old_parts[arr_pos]._position.z * 10.0 * dt;
        new_parts[arr_pos]._density = old_parts[arr_pos]._density * 10.0 * dt;
      }
    }